In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset, DatasetDict
from fine_tune_util import compute_metrics, preprocess_logits_for_metrics
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the name and total memory (in bytes) of CUDA device 0.
gpu_index = 0
print(torch.cuda.get_device_name(gpu_index))
gpu_properties = torch.cuda.get_device_properties(gpu_index)
print(gpu_properties.total_memory)

NVIDIA GeForce RTX 3080 Laptop GPU
8589410304


In [3]:
##################################### All Configuration ###################################
# This should be the only part of this code that is getting modified.

# Folder name of the locally stored base model
# (alternative: "mistralai_Mistral-7B-v0_1").
model_name = "meta-llama_Llama-3.2-3B"
model_path = f"../Local Models/{model_name}"

# Keyword arguments passed to AutoModelForCausalLM.from_pretrained.
from_pretrained_params_dict = {
    "pretrained_model_name_or_path": model_path,
    # "load_in_8bit": True,
    "device_map": "auto",
    "torch_dtype": torch.float16,
}

# Keyword arguments passed to LoraConfig.
lora_config_params_dict = {
    "lora_alpha": 4,  # alternative value: 16
    "lora_dropout": 0.1,
    "r": 8,  # alternative value: 64
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM,
    # "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj",
    #                    "down_proj", "lm_head"],
}

# Placeholder for quantization settings; currently empty.
quantization_params_dict = {}

# Keyword arguments forwarded to the tokenizer call in tokenize_function.
# Padding is disabled here so the data collator can pad each batch instead.
tokenizer_params_dict = {
    "truncation": False,
    "padding": False,
    "return_tensors": None,
    # "max_length": 1024,
}

# Each run writes to its own timestamped checkpoint directory.
cur_datetime = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
checkpoint_dir = f"../../fine_tuned_llms/{model_name}/checkpoints/{cur_datetime}/"
# checkpoint_dir already ends with "/", so do not insert another separator.
metrics_dir = f"{checkpoint_dir}metrics"

# Keyword arguments passed to TrainingArguments.
training_args_dict = {
    "output_dir": checkpoint_dir,
    # Originally chosen for an A40 GPU (unsure whether Rivanna can handle more).
    # NOTE(review): the recorded run reports an 8 GB RTX 3080 Laptop GPU --
    # confirm this batch size is still appropriate there.
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    # "num_train_epochs": 3,
    "max_steps": 101,
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument (available in transformers >= 4.41).
    "eval_strategy": "steps",
    "save_strategy": "steps",
    "eval_steps": 10,
    "save_steps": 10,
    "load_best_model_at_end": True,
    "metric_for_best_model": "perplexity",  # select the best model based on perplexity
    "greater_is_better": False,             # lower perplexity is better
    "logging_dir": metrics_dir,
    "fp16": True,
    # "save_total_limit": 3,  # only keeping best 3
}
###########################################################################################

In [4]:
# Load the cleaned comments and build the train / validation / test splits.

seed = 210

data = pd.read_csv("../../data/Cleaned Data/CNN_comments_clean.csv")

# Drop missing comments before casting: astype(str) would otherwise turn NaN
# into the literal text "nan" and feed it to the model as a training example.
# Only a 10% sample of the comments is used.
comments = data["comment"].dropna().astype(str).sample(frac=0.1, random_state=seed)

# 70% train; the remaining 30% holdout is split evenly into validation and test.
train_comments, holdout_comments = train_test_split(comments, test_size=0.3, random_state=seed)
val_comments, test_comments = train_test_split(holdout_comments, test_size=0.5, random_state=seed)

# from_pandas keeps the sampled index as an extra "__index_level_0__" column;
# that column is removed after tokenization.
train_dataset = Dataset.from_pandas(pd.DataFrame({"text": train_comments}))
val_dataset = Dataset.from_pandas(pd.DataFrame({"text": val_comments}))
test_dataset = Dataset.from_pandas(pd.DataFrame({"text": test_comments}))

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

In [5]:
# Load the tokenizer and the base causal LM from the local model folder,
# then wrap the base model with the LoRA adapters described by the config.
tokenizer = AutoTokenizer.from_pretrained(model_path)
base_model = AutoModelForCausalLM.from_pretrained(**from_pretrained_params_dict)
peft_config = LoraConfig(**lora_config_params_dict)
model = get_peft_model(base_model, peft_config)
print(model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_




In [6]:
# Tokenizer setup: pad on the right and reuse the EOS token as the pad token
# (alternative considered: tokenizer.add_special_tokens({'pad_token': '[PAD]'})).
# Using EOS should be fine since we want to talk like youtube comments.
# NOTE(review): with pad == EOS, a collator that masks pad-token labels will
# also mask any real EOS tokens -- confirm this is acceptable.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the "text" field of a batch using tokenizer_params_dict.

    Padding is not done here; it is left to the data collator.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, **tokenizer_params_dict)
    return encoded

In [7]:
# Tokenize every split, dropping the raw 'text' column during the map and the
# leftover pandas index column ('__index_level_0__') right afterwards, so only
# the tokenizer outputs remain. No 'labels' column is added here.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True, remove_columns=["text"])
    .remove_columns(["__index_level_0__"])
)
print(tokenized_datasets)

# Collator for causal language modeling (mlm=False); batches are padded so
# their length is a multiple of 8.
collator_options = {"tokenizer": tokenizer, "mlm": False, "pad_to_multiple_of": 8}
data_collator = DataCollatorForLanguageModeling(**collator_options)

Map: 100%|██████████| 115390/115390 [00:05<00:00, 22914.62 examples/s]
Map: 100%|██████████| 24726/24726 [00:01<00:00, 24365.13 examples/s]
Map: 100%|██████████| 24727/24727 [00:01<00:00, 21938.82 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 115390
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 24726
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 24727
    })
})





In [8]:
#print(tokenized_datasets['train']['labels'][0])

In [9]:
# Build the TrainingArguments from the config cell and assemble the Trainer.
# NOTE(review): the recorded validation split has 24,726 rows; evaluating all
# of them at every evaluation step may dominate run time -- confirm intended.
training_args = TrainingArguments(**training_args_dict)

trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets["train"],
    "eval_dataset": tokenized_datasets["validation"],
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
    "preprocess_logits_for_metrics": preprocess_logits_for_metrics,
}
trainer = Trainer(**trainer_kwargs)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the fine-tuning loop with the Trainer configured above.
trainer.train()

Step,Training Loss,Validation Loss
