In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset, DatasetDict
from fine_tune_util import compute_metrics, preprocess_logits_for_metrics, token_length_histogram
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_properties(0).total_memory)

NVIDIA A40
47608692736


In [3]:
##################################### All Configuration ###################################
# This should be the only part of this code that is getting modified                      #
model_name = "mistralai_Mistral-7B-v0_1"     #"meta-llama_Llama-3.2-3B" #                                              #
model_path = f"../Local Models/{model_name}"                                              #                                                      
                                                                                          #
from_pretrained_params_dict = {                                                           #
    "pretrained_model_name_or_path" : model_path,                                         #
    #"load_in_8bit":True,                                                                 #
    "device_map":"auto",                                                                  #
    "torch_dtype": torch.float16                                                          #
}                                                                                         #
                                                                                          #
lora_config_params_dict = {                                                               #
    "lora_alpha":4,#16,                                                                      #
    "lora_dropout":0.1,                                                                   #
    "r":8,#64,                                                                               #
    "bias":"none",                                                                        #
    "task_type":TaskType.CAUSAL_LM,                                                       #
    #"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"     #
     #               "down_proj", "lm_head"]                                               #
}                                                                                         #
                                                                                          #
quantization_params_dict = {                                                              #
                                                                                          #
}                                                                                         #
                                                                                          #
tokenizer_params_dict = {                                                                 #
  "truncation":True,
  "padding": True,
  "return_tensors": None,                                                                      #
  "max_length":512                                                                       #
}                                                                                         #
                                                                                          #
cur_datetime = datetime.now().strftime("%Y-%m-%d %H-%M-%S")                               #
checkpoint_dir = f"../../fine_tuned_llms/{model_name}/checkpoints/{cur_datetime}"        #
metrics_dir = f"../../fine_tuned_llms/{model_name}/logs/{cur_datetime}"                                              #                                                                                          
                                                                                          #
training_args_dict = {                                                                    #                 
  "output_dir":checkpoint_dir,                                                            #                              
  "per_device_train_batch_size":8, # using A40 gpu, not sure if rivanna can handle more,  #                                                                                         
                                    # sticking with this for now                          #                                                                
  "per_device_eval_batch_size":4,                                                         #                                 
  #"num_train_epochs":3,                                                                  #                        
  "max_steps": 20,                                                                       #                   
  "evaluation_strategy":"steps",                                                          #                                
  "save_strategy":"steps",                                                                #                          
  "eval_steps":10,                                                                        #                  
  "save_steps":10,                                                                        #                  
  "load_best_model_at_end":True,                                                          #                                
  "metric_for_best_model":"perplexity",   # Select the best model based on perplexity     #                                                                                     
  "greater_is_better":False,              # Lower perplexity is better                    #                                                                      
  "logging_dir":metrics_dir,   
  "logging_strategy": "steps" , 
  "logging_steps": 10,                                                         #                            
  "fp16":True,                                                                            #              
  #save_total_limit=3, # only keeping best 3                                              #                                            
}                                                                                         # 
###########################################################################################

In [4]:
#load dataset

seed = 210

data = pd.read_csv("../../data/Cleaned Data/CNN_comments_clean.csv")  
comments = data["comment"].astype(str).sample(frac=0.07, random_state=seed)

train_comments, test_comments = train_test_split(comments, test_size=0.3, random_state=seed)
val_comments, test_comments = train_test_split(test_comments, test_size=0.9, random_state=seed)

train_dataset = Dataset.from_pandas(pd.DataFrame({"text": train_comments}))
val_dataset = Dataset.from_pandas(pd.DataFrame({"text": val_comments}))
test_dataset = Dataset.from_pandas(pd.DataFrame({"text": test_comments}))

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})




In [5]:
# set up model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(**from_pretrained_params_dict)
peft_config =  LoraConfig(**lora_config_params_dict)
model = get_peft_model(model, peft_config)
print(model)

Loading checkpoint shards:  50%|█████     | 1/2 [00:16<00:16, 16.84s/it]

In [None]:
#set up tokenizer
tokenizer.pad_token = tokenizer.eos_token  # maybe this instead?: tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# using EOS should be fine since we want to talk like youtube comments
tokenizer.padding_side = "right"
def tokenize_function(examples):
    return tokenizer(examples["text"], **tokenizer_params_dict) # we don't do padding here, we let the data collater handle it

In [None]:
# Tokenize each split and remove the 'text' column
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Add 'labels' field for causal language modeling
#tokenized_datasets = tokenized_datasets.map(lambda examples: {"labels": examples["input_ids"]})
tokenized_datasets = tokenized_datasets.remove_columns(["__index_level_0__"])
print(tokenized_datasets)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map: 100%|██████████| 80773/80773 [00:17<00:00, 4720.13 examples/s]
Map: 100%|██████████| 3461/3461 [00:00<00:00, 5079.66 examples/s]
Map: 100%|██████████| 31156/31156 [00:06<00:00, 4653.15 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 80773
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3461
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 31156
    })
})





In [None]:
'''token_length_histogram(tokenized_datasets, 'train')
token_length_histogram(tokenized_datasets, 'validation')
token_length_histogram(tokenized_datasets, 'test')'''

"token_length_histogram(tokenized_datasets, 'train')\ntoken_length_histogram(tokenized_datasets, 'validation')\ntoken_length_histogram(tokenized_datasets, 'test')"

In [None]:
import numpy as np
def preprocess_logits_for_metrics(logits, labels):
    """
    Preprocess logits for evaluation and calculate the batch loss.
    """
    # Move logits and labels to the correct device (if not already)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logits = logits.to(device).float()
    labels = labels.to(device).long()

    # Shift logits and labels for causal language modeling
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    # Compute the batch loss using CrossEntropyLoss
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

    return loss

def compute_metrics(eval_preds):
    """
    Compute evaluation metrics based on the cumulative loss.
    """
    # Unpack eval_preds which contains losses (possibly numpy arrays)
    losses = eval_preds[0]  # Assuming eval_preds is a tuple of (losses, ...)
    
    # Convert losses to a PyTorch tensor if they are in numpy format
    if isinstance(losses, np.ndarray):
        losses = torch.tensor(losses, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # Calculate the cumulative loss and perplexity
    total_loss = torch.sum(losses)  # Sum of all batch losses
    perplexity = torch.exp(total_loss / len(losses))  # Perplexity: exp(loss / num_batches)

    return {
        "eval_loss": total_loss.item()/ len(losses),
        "perplexity": perplexity.item(),
    }

In [None]:
# set up trainer
training_args = TrainingArguments(**training_args_dict)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics   
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Perplexity
10,3.1592,3.148452,23.285486
20,2.9351,3.145749,23.222681


TrainOutput(global_step=20, training_loss=3.047170639038086, metrics={'train_runtime': 718.0093, 'train_samples_per_second': 0.223, 'train_steps_per_second': 0.028, 'total_flos': 3496706687631360.0, 'train_loss': 3.047170639038086, 'epoch': 0.0019807863721897595})

In [None]:
import json
print(trainer.state.log_history)
print(json.dumps(trainer.state.log_history, indent = 4))

[{'loss': 3.1592, 'grad_norm': 0.399615079164505, 'learning_rate': 2.5e-05, 'epoch': 0.0009903931860948797, 'step': 10}, {'eval_loss': 3.148451566696167, 'eval_perplexity': 23.285486221313477, 'eval_runtime': 337.895, 'eval_samples_per_second': 10.243, 'eval_steps_per_second': 2.563, 'epoch': 0.0009903931860948797, 'step': 10}, {'loss': 2.9351, 'grad_norm': 0.3370755910873413, 'learning_rate': 0.0, 'epoch': 0.0019807863721897595, 'step': 20}, {'eval_loss': 3.14574933052063, 'eval_perplexity': 23.222681045532227, 'eval_runtime': 339.2756, 'eval_samples_per_second': 10.201, 'eval_steps_per_second': 2.552, 'epoch': 0.0019807863721897595, 'step': 20}, {'train_runtime': 718.0093, 'train_samples_per_second': 0.223, 'train_steps_per_second': 0.028, 'total_flos': 3496706687631360.0, 'train_loss': 3.047170639038086, 'epoch': 0.0019807863721897595, 'step': 20}]
[
    {
        "loss": 3.1592,
        "grad_norm": 0.399615079164505,
        "learning_rate": 2.5e-05,
        "epoch": 0.00099039318