In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset, DatasetDict
from fine_tune_util import EvalSampleDatasetTrainer, compute_metrics, preprocess_logits_for_metrics_mlm, token_length_histogram, save_dicts_to_csv, save_metrics
from datetime import datetime
import optuna
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# This should be the only part of this code that is getting modified                      #
model_name = "FacebookAI_roberta-large" 
model_path = f"../Local Models/{model_name}"                                              #                                                      
                                                                                          #
from_pretrained_params_dict = {                                                           #
    "pretrained_model_name_or_path" : model_path,                                                           #
    "device_map":"cuda:0",                                                                  #
    #"torch_dtype": torch.float16                                                          #
}                                                                                         #
                                                                                          #
'''lora_config_params_dict = {                                                               #
    "lora_alpha":1024,                                                                      #
    "lora_dropout":0.1,                                                                   #
    "r":512,                                                                               #
    "bias":"none",                                                                        #
    "task_type":TaskType.,                                                       #
}     '''                                                                                    #
                                                                                          #
quantization_params_dict = { }                                                                                         #
                                                                                          #
tokenizer_params_dict = {                                                                 #
  "truncation":True,
  "padding": True,
  "max_length":384                                                                       #
}                                                                                         #
                                                                                          #
cur_datetime = datetime.now().strftime("%Y-%m-%d %H-%M-%S")                               #
checkpoint_dir = f"../../fine_tuned_llms/{model_name}/checkpoints/{cur_datetime}"        #
metrics_dir = f"{checkpoint_dir}/metrics.json"                                              #                                                                                          
                                                                                          #
training_args_dict = {                                                                    #                 
  "output_dir":checkpoint_dir,                                                            #                              
  "per_device_train_batch_size":96, # using A100 gpu, not sure if rivanna can handle more,  #                                                             
  "per_device_eval_batch_size":96,                                  
  "num_train_epochs":1,                
  "evaluation_strategy":"no",                                                          #                                
  "save_strategy":"no",                                                                    #                                
  "metric_for_best_model":"perplexity",                                                                                       
  "greater_is_better":False,                                                        #                            
  "fp16":True, 
  "learning_rate": 1e-4,    
  "lr_scheduler_type":'constant',
  "warmup_steps": 50,  
  "optim" : "adamw_torch",
  "weight_decay" : 0,                                         #                                            
}                                                                                         # 
###########################################################################################

In [3]:
#load dataset

seed = 210

data = pd.read_csv("../../data/Cleaned Data/CNN_comments_clean.csv")  
comments = data["comment"].astype(str).sample(frac=0.1, random_state=seed)

train_comments, test_comments = train_test_split(comments, test_size=0.3, random_state=seed)
val_comments, test_comments = train_test_split(test_comments, test_size=0.5, random_state=seed)

train_dataset = Dataset.from_pandas(pd.DataFrame({"text": train_comments}))
val_dataset = Dataset.from_pandas(pd.DataFrame({"text": val_comments}))
test_dataset = Dataset.from_pandas(pd.DataFrame({"text": test_comments}))

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})



In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast = True)
def tokenize_function(examples):
    return tokenizer(examples["text"], **tokenizer_params_dict) # we don't do padding here, we let the data collater handle it

# Tokenize each split and remove the 'text' column
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Add 'labels' field for language modeling
tokenized_datasets = tokenized_datasets.remove_columns(["__index_level_0__"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability = .15)

Map: 100%|██████████| 114703/114703 [00:32<00:00, 3571.64 examples/s]
Map: 100%|██████████| 24579/24579 [00:06<00:00, 3640.94 examples/s]
Map: 100%|██████████| 24580/24580 [00:06<00:00, 3556.61 examples/s]


In [5]:
def tune_hyperparameters():
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    return study

def objective(trial):
    lr = trial.suggest_uniform('lr',1e-7, 1e-4)
    weight_decay = trial.suggest_uniform('weight_decay',0, .5)
    optim = trial.suggest_categorical('optim', ["adamw_torch", "sgd", "rmsprop"])

    training_args_dict['learning_rate'] = lr
    training_args_dict['weight_decay'] = weight_decay
    training_args_dict['optim'] = optim
    

    
    model = AutoModelForMaskedLM.from_pretrained(**from_pretrained_params_dict)

    # set up trainer
    training_args = TrainingArguments(**training_args_dict)
    trainer = EvalSampleDatasetTrainer(
    eval_sample_size_proportion = .25,
    seed = seed,
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics_mlm,
    )

    trainer.train()
    perplexity = trainer.evaluate()['eval_perplexity']
    del model, training_args, trainer
    gc.collect()
    torch.cuda.empty_cache()
    return perplexity
    

In [6]:
study = tune_hyperparameters()
df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
df.to_csv('/home/bhx5gh/Documents/NLP/NLP_Final_Political_Bias_Shifts/src/LLMs/bert_hyper_param_tuning_results.csv')

[I 2024-12-15 01:53:10,669] A new study created in memory with name: no-name-510064f4-e559-427b-8b0a-3691cbde4af4
  lr = trial.suggest_uniform('lr',1e-7, 1e-4)
  weight_decay = trial.suggest_uniform('weight_decay',0, .5)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[W 2024-12-15 01:53:15

OutOfMemoryError: CUDA out of memory. Tried to allocate 144.00 MiB. GPU 0 has a total capacity of 39.38 GiB of which 87.81 MiB is free. Including non-PyTorch memory, this process has 39.17 GiB memory in use. Of the allocated memory 38.63 GiB is allocated by PyTorch, and 49.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)