# Using 🤗 PEFT & bitsandbytes to finetune a LoRa checkpoint




In [6]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

In [13]:
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-32GB (UUID: GPU-a0481df7-b5c0-3380-57be-aed485478630)


### Setup the model

In [18]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM


model_name = 'distilbert/distilgpt2' #gpt2

model_name = 'skt/kogpt2-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # load_in_8bit=True, # 표시!
    # device_map='auto', # 표시!
)

# 위에 표시한거랑 주석처리한거랑 안한거랑 아래 파라미터는 똑같음.
# trainable params: 589824 || all params: 125753856 || trainable%: 0.46903054805730965

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0"
# import torch
# import torch.nn as nn
# import bitsandbytes as bnb
# from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

# model_name = 'skt/kogpt2-base-v2'
# quantization_config = BitsAndBytesConfig(enabled=True, mode='mixed8', reduce_dtype=True)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=quantization_config,
#     device_map='auto',
# )
# # trainable params: 589824 || all params: 83286528 || trainable%: 0.7081865628976634

### Freezing the original weights


In [23]:
for name,param in model.named_parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    print(name)
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

base_model.model.transformer.h.0.ln_1.weight
base_model.model.transformer.h.0.ln_1.bias
base_model.model.transformer.h.0.attn.c_attn.base_layer.bias
base_model.model.transformer.h.0.attn.c_proj.bias
base_model.model.transformer.h.0.ln_2.weight
base_model.model.transformer.h.0.ln_2.bias
base_model.model.transformer.h.0.mlp.c_fc.bias
base_model.model.transformer.h.0.mlp.c_proj.bias
base_model.model.transformer.h.1.ln_1.weight
base_model.model.transformer.h.1.ln_1.bias
base_model.model.transformer.h.1.attn.c_attn.base_layer.bias
base_model.model.transformer.h.1.attn.c_proj.bias
base_model.model.transformer.h.1.ln_2.weight
base_model.model.transformer.h.1.ln_2.bias
base_model.model.transformer.h.1.mlp.c_fc.bias
base_model.model.transformer.h.1.mlp.c_proj.bias
base_model.model.transformer.h.2.ln_1.weight
base_model.model.transformer.h.2.ln_1.bias
base_model.model.transformer.h.2.attn.c_attn.base_layer.bias
base_model.model.transformer.h.2.attn.c_proj.bias
base_model.model.transformer.h.2.ln

AttributeError: 'GPT2LMHeadModel' object has no attribute 'enable_input_require_grads'

### Setting up the LoRa Adapters

In [24]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [25]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, #attention heads
    lora_alpha=64, #alpha scaling
    # target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 589824 || all params: 125753856 || trainable%: 0.46903054805730965


In [26]:
for name,p in model.named_parameters():
  if p.requires_grad:
    print(name)
    # break

base_model.model.base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight
base_model.model.base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight
base_model.model.base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight
base_model.model.base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight
base_model.model.base_model.model.transformer.h.2.attn.c_attn.lora_A.default.weight
base_model.model.base_model.model.transformer.h.2.attn.c_attn.lora_B.default.weight
base_model.model.base_model.model.transformer.h.3.attn.c_attn.lora_A.default.weight
base_model.model.base_model.model.transformer.h.3.attn.c_attn.lora_B.default.weight
base_model.model.base_model.model.transformer.h.4.attn.c_attn.lora_A.default.weight
base_model.model.base_model.model.transformer.h.4.attn.c_attn.lora_B.default.weight
base_model.model.base_model.model.transformer.h.5.attn.c_attn.lora_A.default.weight
base_model.model.base_model.model.transformer.h.5.attn.c_attn.lora_B.default

## Data

In [7]:
import transformers
from datasets import load_dataset
data = load_dataset("Abirate/english_quotes")
data


DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2508
    })
})

In [10]:
def merge_columns(example):
    example["prediction"] = example["quote"] + " ->: " + str(example["tags"])
    return example

data['train'] = data['train'].map(merge_columns)
data['train']["prediction"][:5]
# data

Map: 100%|██████████| 2508/2508 [00:00<00:00, 13029.48 examples/s]


["“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']",
 "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.” ->: ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst']",
 "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.” ->: ['human-nature', 'humor', 'infinity', 'philosophy', 'science', 'stupidity', 'universe']",
 "“So many books, so little time.” ->: ['books', 'humor']",
 "“A room without books is like a body without a soul.” ->: ['books', 'simile', 'soul']"]

In [11]:
# 'train' 데이터셋에서 'quote' 열 삭제
data['train'] = data['train'].remove_columns('quote')
data['train'] = data['train'].remove_columns('tags')
data['train'] = data['train'].remove_columns('author')

In [13]:
data

DatasetDict({
    train: Dataset({
        features: ['prediction'],
        num_rows: 2508
    })
})

In [14]:
data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)

Map: 100%|██████████| 2508/2508 [00:00<00:00, 14230.19 examples/s]


In [15]:
data

DatasetDict({
    train: Dataset({
        features: ['prediction', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

# new dataset 적용
## 1. query 2 query

In [27]:
# New dataset
from datasets import load_dataset
data_path = 'neeva/query2query_evaluation'

dataset = load_dataset(data_path)
type(dataset),type(dataset['test'])
dataset['test'][0]

def merge_columns(example):
    example["prediction"] = example["query1 "] + ", " + str(example["query2"])
    return example

dataset['test'] = dataset['test'].map(merge_columns)
dataset['test'] = dataset['test'].remove_columns('query1 ')
dataset['test'] = dataset['test'].remove_columns('query2')
dataset['test'] = dataset['test'].remove_columns('rating')
print(dataset)

dataset = dataset.map(lambda samples: tokenizer(samples['prediction']), batched=True)
dataset['test'][0]

DatasetDict({
    test: Dataset({
        features: ['prediction'],
        num_rows: 1000
    })
})


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'prediction': 'what is a mole, ground moles',
 'input_ids': [18896,
  446,
  10272,
  739,
  9969,
  13612,
  11849,
  31816,
  387,
  15317,
  10689,
  16799,
  11849,
  10602,
  9792],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## 2. query 2 doc

In [17]:
from datasets import load_dataset
data_path = 'intfloat/query2doc_msmarco'

dataset = load_dataset(data_path)
print(dataset)

def merge_columns(example):
    example["prediction"] = 'question : '+ example["query"] + " answer : " + str(example["pseudo_doc"])
    return example

def remove_col(dataset,name):
  dataset[name] = dataset[name].map(merge_columns) # make list
  dataset[name] = dataset[name].remove_columns('query')
  dataset[name] = dataset[name].remove_columns('pseudo_doc')
  dataset[name] = dataset[name].remove_columns('query_id')

  dataset[name] = dataset[name].map(lambda samples: tokenizer(samples['prediction']), batched=True)

remove_col(dataset,'train')
remove_col(dataset,'validation')
remove_col(dataset,'test')
dataset

Downloading builder script: 100%|██████████| 3.90k/3.90k [00:00<00:00, 10.3MB/s]
Downloading readme: 100%|██████████| 2.27k/2.27k [00:00<00:00, 11.1MB/s]
Downloading data: 100%|██████████| 241M/241M [00:12<00:00, 19.0MB/s]
Downloading data: 100%|██████████| 3.36M/3.36M [00:01<00:00, 3.01MB/s]
Downloading data: 100%|██████████| 3.24M/3.24M [00:01<00:00, 2.90MB/s]
Downloading data: 100%|██████████| 22.0k/22.0k [00:00<00:00, 42.1MB/s]
Downloading data: 100%|██████████| 27.0k/27.0k [00:00<00:00, 43.7MB/s]
Downloading data files: 100%|██████████| 5/5 [00:20<00:00,  4.01s/it]


{'train': '/data/ephemeral/home/.cache/huggingface/datasets/downloads/363e1af87c516ca324da4b77663f61b96190535e3dd47921a90800abc8e4b8c5', 'dev': '/data/ephemeral/home/.cache/huggingface/datasets/downloads/b594b31a1055055615af076fdf73c320cdd70988c5e136b7ff7e6b777f9ab2f0', 'test': '/data/ephemeral/home/.cache/huggingface/datasets/downloads/bd74d174629b3e7ccba1407f4fa051e241d243244c957aaa1d5c08becb3d72e4', 'trec_dl2019': '/data/ephemeral/home/.cache/huggingface/datasets/downloads/67aadce3215b4c9bc41440220036f87d10f8a09ed714b3c2ee7fabfefad80a51', 'trec_dl2020': '/data/ephemeral/home/.cache/huggingface/datasets/downloads/0613d96bd7f92fd7f422316998ecaf2536dd08c1ac5835ce288706af00530d28'}


Generating train split: 502939 examples [00:18, 26880.27 examples/s]
Generating validation split: 6980 examples [00:00, 26928.23 examples/s]
Generating test split: 6837 examples [00:00, 27135.10 examples/s]
Generating trec_dl2019 split: 43 examples [00:00, 12894.48 examples/s]
Generating trec_dl2020 split: 54 examples [00:00, 13879.06 examples/s]


DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 502939
    })
    validation: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 6980
    })
    test: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 6837
    })
    trec_dl2019: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 43
    })
    trec_dl2020: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 54
    })
})


Map: 100%|██████████| 502939/502939 [00:33<00:00, 15213.31 examples/s]
Map: 100%|██████████| 502939/502939 [00:51<00:00, 9708.85 examples/s] 
Map: 100%|██████████| 6980/6980 [00:00<00:00, 15683.35 examples/s]
Map: 100%|██████████| 6980/6980 [00:00<00:00, 8420.65 examples/s]
Map: 100%|██████████| 6837/6837 [00:00<00:00, 15364.50 examples/s]
Map: 100%|██████████| 6837/6837 [00:00<00:00, 10134.45 examples/s]


DatasetDict({
    train: Dataset({
        features: ['prediction', 'input_ids', 'attention_mask'],
        num_rows: 502939
    })
    validation: Dataset({
        features: ['prediction', 'input_ids', 'attention_mask'],
        num_rows: 6980
    })
    test: Dataset({
        features: ['prediction', 'input_ids', 'attention_mask'],
        num_rows: 6837
    })
    trec_dl2019: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 43
    })
    trec_dl2020: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 54
    })
})

# 3. query 2 docs mix : mydataset

In [28]:
import pandas as pd
train = pd.read_csv('../open/train.csv')
test = pd.read_csv('../open/test.csv')

query_1 = list(train.질문_1)
query_2 = list(train.질문_2)

answer_1 = list(train.답변_1)
answer_2 = list(train.답변_2)
answer_3 = list(train.답변_3)
answer_4 = list(train.답변_4)
answer_5 = list(train.답변_5)

queyr1개와 answer1개 섞기.

In [29]:
import pandas as pd
from datasets import DatasetDict, Dataset, Features
from datasets import DatasetDict

# Read the training data
train = pd.read_csv("../open/train.csv")

# Extract queries and answers
queries = train["질문_1"]
answers = []

for i in range(len(train)):
    row_answers = train.iloc[i][["답변_1", "답변_2", "답변_3", "답변_4", "답변_5"]].dropna().tolist()
    for answer in row_answers:
        
        ######### Caution!!!!! #########
        #여기가 리스트이면 안됨!!!!!!!!
        # answers.append([queries.iloc[i]+' '+answer]) 
        answers.append(queries.iloc[i]+' '+answer) 
        
queries = train["질문_2"]
for i in range(len(train)):
    row_answers = train.iloc[i][["답변_1", "답변_2", "답변_3", "답변_4", "답변_5"]].dropna().tolist()
    for answer in row_answers:
        
        # answers.append({queries.iloc[i]+' '+answer})
        # answers.append([queries.iloc[i]+' '+answer]
        answers.append(queries.iloc[i]+' '+answer) 
        # answers.append({"문": queries.iloc[i], "답": answer})
        
# Create the Dataset
dataset = Dataset.from_dict({"prediction": answers})

# Print the Dataset
print(dataset)
dataset_dict = DatasetDict({"train": dataset})

print()
# 비슷하게 만들었다.
"""
DatasetDict({
    train: Dataset({
        features: ['prediction', 'input_ids', 'attention_mask'],
        num_rows: 502939
    })
})
"""
print(dataset_dict)

# 실패
def tokenizers_on(dataset,name):
  dataset[name] = dataset[name].map(lambda samples: tokenizer(samples['prediction']), batched=True)
  return dataset
# dataset_dict = remove_col(dataset_dict,'train')
# dataset_dict


dataset_dict = dataset_dict.map(lambda samples: tokenizer(samples['prediction']), batched=True)
dataset_dict

print(type(dataset_dict),type(dataset))

Dataset({
    features: ['prediction'],
    num_rows: 6440
})

DatasetDict({
    train: Dataset({
        features: ['prediction'],
        num_rows: 6440
    })
})


Map:   0%|          | 0/6440 [00:00<?, ? examples/s]

<class 'datasets.dataset_dict.DatasetDict'> <class 'datasets.arrow_dataset.Dataset'>


# 일단 완성!!!!!!!!

#### 다른 데이터 형식을 빌려와서 추가하기!

In [120]:
dataset_dict['train'][0]

{'prediction': '면진장치가 뭐야? 면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 격리장치입니다.'}

In [121]:
dataset0['test'][0]

{'prediction': 'what is a mole, ground moles'}

In [95]:
# New dataset
from datasets import load_dataset
data_path = 'neeva/query2query_evaluation'

dataset0 = load_dataset(data_path)
print(type(dataset0),type(dataset0['test']))
print(dataset0['test'])

def merge_columns(example):
    example["prediction"] = example["query1 "] + ", " + str(example["query2"])
    return example

dataset0['test'] = dataset0['test'].map(merge_columns)
dataset0['test'] = dataset0['test'].remove_columns('query1 ')
dataset0['test'] = dataset0['test'].remove_columns('query2')
dataset0['test'] = dataset0['test'].remove_columns('rating')
print(dataset0)

dataset0['train'] = dataset


<class 'datasets.dataset_dict.DatasetDict'> <class 'datasets.arrow_dataset.Dataset'>
Dataset({
    features: ['query1 ', 'query2', 'rating'],
    num_rows: 1000
})
DatasetDict({
    test: Dataset({
        features: ['prediction'],
        num_rows: 1000
    })
})


# 쿼리 2개 와 답변 2개 섞기.

In [17]:
i = 3
train.iloc[i][["답변_1", "답변_2", "답변_3", "답변_4", "답변_5"]].dropna().tolist()

['철근철골콘크리트는 철골과 철근, 그리고 콘크리트를 함께 사용하는 건축 구조입니다. 철골은 강하고 가벼운 소재로, 높은 내구성과 강도를 가지고 있어 대규모 공간이나 강성이 요구되는 대형 고층건물에 적합합니다. 그러나 화재에 취약하고 녹이 슬면 강성이 떨어지는 단점이 있습니다. 이를 보완하기 위해 철근과 콘크리트를 섞어 사용하는 방식이 개발되었습니다. 철근콘크리트는 철근이 인장력을, 콘크리트가 압축력을 담당하여 강성을 가지며 콘크리트의 열전도가 낮은 재료상 특징을 이용해 서로의 단점을 보완하도록 결합한 구조입니다. 내화성, 내구성, 내진성능이 뛰어나지만 자중이 크고 시공과정이 복잡하며 공사기간이 길고 균일한 시공이 불가능하다는 단점이 있습니다. 철근철골콘크리트는 이 두 가지 구조의 장점을 살려 만든 것으로, 철골 뼈대로 구조를 세우고 그 주변을 철근과 콘크리트로 감싸거나, 철근과 콘크리트로 구조를 만들고 그 위에 철골을 올리는 형태로 만들어집니다. 이렇게 함으로써 철골의 내구성과 강도를 높이면서도 화재에 대한 안전성을 높일 수 있습니다.',
 '철골철근콘크리트 구조는 건축물을 지탱하는 주요 구조물인 철골과 철근, 그리고 콘크리트를 함께 사용하는 건축 구조입니다. 이 구조는 철골의 강도와 내구성, 철근의 인장력, 그리고 콘크리트의 압축력을 조합하여 건축물에 대한 강성과 내구성을 향상시킵니다.   기존 철골 구조의 화재에 취약하고 녹이 스면 강성이 떨어지는 단점을 보완하기 위해 개발된 구조로, 내화성, 내구성, 내진성능이 뛰어나지만 자중이 크고 시공과정이 복잡하며 공사기간이 길고 균일한 시공이 어렵다는 단점 또한 존재합니다.   이러한 구조는 철골을 뼈대로 구조를 세우고 그 주변을 철근과 콘크리트로 감싸거나, 철근과 콘크리트로 구조를 만들고 그 위에 철골을 올리는 형태로 만들어집니다. 이로써 철골의 내구성과 강도를 높이면서도 화재에 대한 안전성을 높일 수 있습니다.',
 '철골철근 콘크리트 구조는 건축물을 지탱하기 위한 구조물에서 일반적으로 사용되는 방식으로, 철

In [42]:
import pandas as pd
from datasets import DatasetDict, Dataset, Features

# Read the training data
train = pd.read_csv("../open/train.csv")

# Extract queries and answers
queries = train["질문_1"]
answers = []

for i in range(len(train)):
    # list 형식이다.
    row_answers = train.iloc[i][["답변_1", "답변_2", "답변_3", "답변_4", "답변_5"]].dropna().tolist()
    for answer in row_answers:
        answers.append(queries.iloc[i]+' '+answer)
        
queries = train["질문_2"]
for i in range(len(train)):
    row_answers = train.iloc[i][["답변_1", "답변_2", "답변_3", "답변_4", "답변_5"]].dropna().tolist()
    for answer in row_answers:
        answers.append(queries.iloc[i]+' '+answer)
        # answers.append({"문": queries.iloc[i], "답": answer})

for i in range(len(train)):
    row_answers_1 = train.iloc[i][["답변_1", "답변_2", "답변_3", "답변_4", "답변_5"]].dropna().tolist()
    for j in range(len(train)):
        row_answers_2 = train.iloc[j][["답변_1", "답변_2", "답변_3", "답변_4", "답변_5"]].dropna().tolist()
        row_answers = row_answers_1[i]
queries = 
# Create the Dataset
dataset = Dataset.from_dict({"data": answers})

# Print the Dataset
print(dataset)
dataset['data']

SyntaxError: invalid syntax (2797452991.py, line 29)

In [32]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Training

In [33]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# 토크나이저에 패딩 토큰 설정 (eos_token 사용)
tokenizer.pad_token = tokenizer.eos_token

# 모델 토크나이저의 변경 사항에 따라 모델의 토큰 임베딩 크기 조정
model.resize_token_embeddings(len(tokenizer))


# 트레이닝을 위한 설정
training_args = TrainingArguments(
    num_train_epochs=5, ## Epochs
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=500,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    output_dir='outputs',
    evaluation_strategy='epoch',  # 에포크마다 평가
    logging_dir='./logs',  # 로그 디렉토리 설정
    report_to="tensorboard",  # TensorBoard에 로그 보고
)


# 데이터 콜레이터 설정 (마스크된 언어 모델링이 아닌 경우 mlm=False 사용)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# trainer = WeightedLosstrainer(

# )
# 트레이너 설정 및 트레이닝 시작
trainer = Trainer(
    model=model,
    args=training_args,
    # train_dataset=data['train'],
    train_dataset=dataset_dict['train'],
    # train_dataset=dataset['train'],
    # eval_dataset=dataset['validation'],  # 평가 데이터셋으로 dataset['valid'] 사용
    data_collator=data_collator,
    push_to_hub=True,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
cannot import name 'is_torch_mps_available' from 'transformers.utils' (/data/ephemeral/home/JH/Vsearch/lib/python3.10/site-packages/transformers/utils/__init__.py)

## Share adapters on the 🤗 Hub

In [None]:
model.push_to_hub("Soran/gpt2_lora_query2query_custom",
                  use_auth_token=True,
                  commit_message="query2query training",
                  private=True)



CommitInfo(commit_url='https://huggingface.co/Soran/gpt2_lora_query2query/commit/9e76547caeedea355f87dc2001da8b0581c6c578', commit_message='query2query training', commit_description='', oid='9e76547caeedea355f87dc2001da8b0581c6c578', pr_url=None, pr_revision=None, pr_num=None)

## Load adapters from the Hub

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "Soran/gpt2_lora_query2query"
config = PeftConfig.from_pretrained(peft_model_id)

model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

TypeError: 'LoraConfig' object does not support item assignment

In [None]:
origin_name = 'distilbert/distilgpt2'
model = AutoModelForCausalLM.from_pretrained(origin_name, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(origin_name)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


adapter_model.safetensors:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

## Inference

In [None]:
batch = tokenizer("“Training models with PEFT and LoRa is cool” ->: ", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=20)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




 “Training models with PEFT and LoRa is cool” ->: ㅠㅠㅠㅠㅠㅠ�


In [None]:
texts = ['hi my name is','what is your']
batch = tokenizer(texts[1], return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=10)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=False))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




 what is your favorite game?









In [None]:
tokenizer.decode(output_tokens[0],skip_special_tokens=False)

'i like cute little girls.\n\n\n\n\n\n\n'

In [None]:
# 모델의 첫 번째 파라미터의 데이터 타입 확인
first_param_dtype = next(model.parameters()).dtype
print(f"Data type of the first parameter: {first_param_dtype}")


Data type of the first parameter: torch.float16


In [None]:
from datasets import load_dataset
data_path = 'neeva/query2query_evaluation'

dataset = load_dataset(data_path)
type(dataset),type(dataset['test'])
dataset['test'][0]

def merge_columns(example):
    example["prediction"] = example["query1 "] + ", " + str(example["query2"])
    return example

dataset['test'] = dataset['test'].map(merge_columns)
dataset['test'] = dataset['test'].remove_columns('query1 ')
dataset['test'] = dataset['test'].remove_columns('query2')
dataset['test'] = dataset['test'].remove_columns('rating')
print(dataset)

dataset = dataset.map(lambda samples: tokenizer(samples['prediction']), batched=True)
dataset['test'][0]

DatasetDict({
    test: Dataset({
        features: ['prediction'],
        num_rows: 1000
    })
})


{'prediction': 'what is a mole, ground moles',
 'input_ids': [10919, 318, 257, 9411, 11, 2323, 285, 4316],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
from datasets import load_dataset
data_path = 'intfloat/query2doc_msmarco'

dataset = load_dataset(data_path)
print(dataset)

def merge_columns(example):
    example["prediction"] = 'question : '+ example["query"] + " answer : " + str(example["pseudo_doc"])
    return example

def remove_col(dataset,name):
  dataset[name] = dataset[name].map(merge_columns)
  dataset[name] = dataset[name].remove_columns('query')
  dataset[name] = dataset[name].remove_columns('pseudo_doc')
  dataset[name] = dataset[name].remove_columns('query_id')

  dataset[name] = dataset[name].map(lambda samples: tokenizer(samples['prediction']), batched=True)

remove_col(dataset,'train')
remove_col(dataset,'validation')
remove_col(dataset,'test')
dataset

DatasetDict({
    train: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 502939
    })
    validation: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 6980
    })
    test: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 6837
    })
    trec_dl2019: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 43
    })
    trec_dl2020: Dataset({
        features: ['query_id', 'query', 'pseudo_doc'],
        num_rows: 54
    })
})


Map:   0%|          | 0/502939 [00:00<?, ? examples/s]

Map:   0%|          | 0/502939 [00:00<?, ? examples/s]

Map:   0%|          | 0/6837 [00:00<?, ? examples/s]

Map:   0%|          | 0/6837 [00:00<?, ? examples/s]