In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
import torch
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset, DatasetDict
from fine_tune_util import compute_metrics, preprocess_logits_for_metrics, token_length_histogram, save_dicts_to_csv, save_metrics
from datetime import datetime
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report which GPU is visible as device 0 and how much memory it exposes (in bytes).
gpu_index = 0
gpu_properties = torch.cuda.get_device_properties(gpu_index)
print(torch.cuda.get_device_name(gpu_index))
print(gpu_properties.total_memory)

NVIDIA A40
47608692736


In [3]:
##################################### All Configuration ###################################
# This should be the only part of this code that is getting modified.
#
# Each group of settings is a plain dict that is unpacked into the matching constructor
# further down; the same dicts are written out with the run's results in the last cell.

# Folder name of the base model under "../Local Models".
# Alternative: "mistralai_Mistral-7B-v0_1"
model_name = "meta-llama_Llama-3.2-3B"
model_path = f"../Local Models/{model_name}"

# Settings unpacked into BitsAndBytesConfig (4-bit NF4 with double quantization).
quantization_params_dict = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_storage": torch.float16,
}

# Keyword arguments for AutoModelForCausalLM.from_pretrained.
from_pretrained_params_dict = {
    "pretrained_model_name_or_path": model_path,
    "device_map": "auto",
    "torch_dtype": torch.float16,
    "quantization_config": BitsAndBytesConfig(**quantization_params_dict),
}

# Keyword arguments for LoraConfig.
lora_config_params_dict = {
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    "r": 64,
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM,
    # Optional explicit module list (disabled). Note the comma after "up_proj": without it
    # Python would silently join "up_proj" and "down_proj" into one string.
    # "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj",
    #                    "down_proj", "lm_head"],
}

# Keyword arguments forwarded to the tokenizer call in tokenize_function.
tokenizer_params_dict = {
    "truncation": True,
    "padding": True,
    # "return_tensors": None,
    "max_length": 384,
}

# Every run gets its own timestamped checkpoint folder.
cur_datetime = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
checkpoint_dir = f"../../fine_tuned_llms/{model_name}/checkpoints/{cur_datetime}"
# Despite the name this is a FILE path: the JSON file the training log history is saved to.
metrics_dir = f"{checkpoint_dir}/metrics.json"
# Directory handed to the Trainer for its own logging output. It must not be the same path
# as the metrics JSON file, otherwise the file write at the end of the notebook can collide
# with a directory created during training.
logging_dir = f"{checkpoint_dir}/logs"

# Keyword arguments for TrainingArguments.
training_args_dict = {
    "output_dir": checkpoint_dir,
    # using A40 gpu, not sure if rivanna can handle more, sticking with this for now
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    # "max_steps": 30,
    # NOTE(review): newer transformers releases rename this key to "eval_strategy" and
    # deprecate "evaluation_strategy" -- confirm against the installed version on upgrade.
    "evaluation_strategy": "steps",
    "save_strategy": "steps",
    "eval_steps": 300,
    "save_steps": 300,
    "load_best_model_at_end": True,
    "metric_for_best_model": "perplexity",  # Select the best model based on perplexity
    "greater_is_better": False,             # Lower perplexity is better
    "logging_dir": logging_dir,
    "logging_strategy": "steps",
    "logging_steps": 10,
    "fp16": True,
    "learning_rate": 1e-4,
    "lr_scheduler_type": 'constant',
    # "adafactor": True,
    "optim": "adamw_bnb_8bit",
    "eval_on_start": True,
    # "save_total_limit": 3,  # only keeping best 3
}
###########################################################################################

In [4]:
# Load the cleaned comments, keep a reproducible 10% sample, and split it 70/15/15
# into train / validation / test.

seed = 210

data = pd.read_csv("../../data/Cleaned Data/CNN_comments_clean.csv")
comments = data["comment"].astype(str).sample(frac=0.1, random_state=seed)

# First cut: 70% train, 30% held out. Second cut: halve the held-out part.
train_comments, held_out_comments = train_test_split(
    comments, test_size=0.3, random_state=seed
)
val_comments, test_comments = train_test_split(
    held_out_comments, test_size=0.5, random_state=seed
)

# Wrap each split in a single-column ("text") Hugging Face Dataset.
train_dataset = Dataset.from_pandas(pd.DataFrame({"text": train_comments}))
val_dataset = Dataset.from_pandas(pd.DataFrame({"text": val_comments}))
test_dataset = Dataset.from_pandas(pd.DataFrame({"text": test_comments}))

dataset = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)



In [5]:
# Load the tokenizer and the base model from the local model folder, then wrap the
# base model with the LoRA adapters described by lora_config_params_dict.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

base_model = AutoModelForCausalLM.from_pretrained(**from_pretrained_params_dict)
peft_config = LoraConfig(**lora_config_params_dict)
model = get_peft_model(base_model, peft_config)

# Print the module tree to see where adapters were attached.
print(model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.41s/it]


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): 

In [6]:
# Set up tokenizer
# Reuse EOS as the padding token.
# Alternative: tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Using EOS should be fine since we want to talk like youtube comments.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the raw comment string(s);
            this is what `Dataset.map(..., batched=True)` passes in.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    # Truncation, padding and max_length all come from tokenizer_params_dict. Padding is
    # therefore applied HERE whenever that dict sets "padding" (it currently does); it is
    # not left solely to the data collator.
    return tokenizer(examples["text"], **tokenizer_params_dict)

In [7]:
# Tokenize every split in batches. map() drops the raw 'text' column, and the leftover
# '__index_level_0__' column is removed right after, leaving only the tokenizer outputs.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
).remove_columns(["__index_level_0__"])
print(tokenized_datasets)

# An explicit 'labels' column is intentionally not added (kept for reference):
#tokenized_datasets = tokenized_datasets.map(lambda examples: {"labels": examples["input_ids"]})

# Collator configured for causal (non-masked) language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map: 100%|██████████| 115390/115390 [00:15<00:00, 7684.22 examples/s]
Map: 100%|██████████| 24726/24726 [00:03<00:00, 8236.70 examples/s]
Map: 100%|██████████| 24727/24727 [00:03<00:00, 7674.25 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 115390
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 24726
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 24727
    })
})





In [8]:
# Disabled diagnostic: the three token_length_histogram calls are wrapped in a string
# literal, so nothing is plotted and the cell only echoes the string as its output.
'''token_length_histogram(tokenized_datasets, 'train')
token_length_histogram(tokenized_datasets, 'validation')
token_length_histogram(tokenized_datasets, 'test')'''

"token_length_histogram(tokenized_datasets, 'train')\ntoken_length_histogram(tokenized_datasets, 'validation')\ntoken_length_histogram(tokenized_datasets, 'test')"

In [9]:
# Build the Trainer: training arguments come from the configuration dict, data from the
# tokenized train/validation splits, metrics from the fine_tune_util helpers.
training_args = TrainingArguments(**training_args_dict)

trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets["train"],
    "eval_dataset": tokenized_datasets["validation"],
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
    "preprocess_logits_for_metrics": preprocess_logits_for_metrics,
}
trainer = Trainer(**trainer_kwargs)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [10]:
# Run the training loop configured on `trainer`. The call is the cell's last expression,
# so whatever train() returns is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
# Persist this run's results together with the exact configuration that produced them.

# Training/evaluation log history collected by the Trainer, written to the metrics JSON
# path defined in the configuration cell (inside this run's checkpoint folder).
save_metrics(trainer.state.log_history, metrics_dir)

# All configuration dicts, grouped by the component they configure.
hyperparams = {
    "from_pretrained_params": from_pretrained_params_dict,
    "lora_config_params": lora_config_params_dict,
    "quantization_params": quantization_params_dict,
    "tokenizer_params": tokenizer_params_dict,
    "training_args": training_args_dict,
}

# Keep the run log next to this model's checkpoints. The path is built from model_name and
# the same relative root as checkpoint_dir, so it follows the configured model instead of
# pointing at a hardcoded absolute path in another model's folder on one user's machine.
runs_csv_path = f"../../fine_tuned_llms/{model_name}/runs.csv"
save_dicts_to_csv(hyperparams, runs_csv_path, model_name, cur_datetime)
