In [1]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1,target_modules=["q_proj", "v_proj"])


tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.27it/s]


In [2]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 4,472,832 || all params: 9,246,178,816 || trainable%: 0.0484


In [3]:
from torch.utils.data import Dataset
import pandas as pd
class CustomDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=4096):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = f"<start_of_turn>user\n{row['title']}<end_of_turn>\n<start_of_turn>model\n{row['reply1_content']}<end_of_turn>"
        
        # 添加padding和truncation参数
        encodings = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",  # 添加padding
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            "input_ids": encodings['input_ids'].squeeze(),
            "attention_mask": encodings['attention_mask'].squeeze(),  # 添加attention_mask
            "labels": encodings['input_ids'].squeeze()
        }

In [4]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="models/bigscience/gemma-2-9b-it-lora",
    learning_rate=1e-3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [5]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=CustomDataset(csv_file="dataset/merged_character_guides.csv", tokenizer=tokenizer),
    eval_dataset=CustomDataset(csv_file="dataset/merged_character_guides.csv", tokenizer=tokenizer),
    tokenizer=tokenizer,
    
)

trainer.train()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [13]:
print(trainer)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False,
evaluation_s

In [12]:
input_text = "中国最好的大学是哪一所"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=1000)
print(tokenizer.decode(outputs[0]))

<bos>中国最好的大学是哪一所？<end_of_turn>
<start_of_turn>model
nan<end_of_turn>
<start_of_turn>model
nan{我，我，我，我，我感觉我感觉我感觉我感觉我感觉我感觉我感觉我感觉我感觉我，我感觉这套套的，我感觉我感觉这套套套，我感觉我感觉我感觉我，我感觉我，我感觉我，我感觉我，我感觉我，我感觉我，我感觉我，我感觉我，我感觉我，我感觉这帖子，我感觉这帖子，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我感觉自己，我感觉自己，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我可可，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我可可，我，我可行，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我，我可莉，我可可，我可莉，我，我，我，我，我，我，我，我，我，我可可，我，我可可，我，我可莉，我，我，我可莉，我，我可莉，我可可，我，我可可，我，我可莉，我，我可莉，我，我，我，我，我，我可莉，我，我，我，我，我，我，我，我，我，我，我可莉，我，我，我可莉，我，我，我，我，我，我感觉我，我可莉，我可莉，我可可，我，我可行，我，我可可，我可可，我可达，我，我可可，我可莉，我可莉，我可可就当个个个，我，我感觉我，我感觉我，我可达，我感觉我，我可不能用，我，我可可，我，我可可，我可达，我可可，我，我可可，我，