In [1]:
import optuna
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset, DatasetDict
from fine_tune_util import EvalSampleDatasetTrainer, compute_metrics, preprocess_logits_for_metrics, token_length_histogram, save_dicts_to_csv, save_metrics
from datetime import datetime
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# This should be the only part of this code that is getting modified
model_name = "mistralai_Mistral-7B-v0_1" #"meta-llama_Llama-3.2-3B" #       'meta-llama_Llama-3.2-1B'#                                       #
model_path = f"../Local Models/{model_name}"
from_pretrained_params_dict = {
    "pretrained_model_name_or_path" : model_path,
    "device_map":"auto",
    "torch_dtype": torch.float16
    }

lora_config_params_dict = {
    "lora_alpha":16,
    "lora_dropout":0.1,
    "r":64,
    "bias":"none",
    "task_type":TaskType.CAUSAL_LM,
    }

quantization_params_dict = {}

tokenizer_params_dict = {
  "truncation":True,
  "padding": True,
  "max_length":384
  }

cur_datetime = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
checkpoint_dir = f"../../fine_tuned_llms/{model_name}/checkpoints/{cur_datetime}"
metrics_dir = f"{checkpoint_dir}/metrics.json"

training_args_dict = {                 
  "output_dir":checkpoint_dir,
  "per_device_train_batch_size":24,
  "per_device_eval_batch_size":32,
  "max_steps": 1000,
  "evaluation_strategy":"no",
  "save_strategy":"no",
  "load_best_model_at_end":True,
  "metric_for_best_model":"perplexity",
  "greater_is_better":False,
  "fp16":True,
  "learning_rate":  1e-4,
  "lr_scheduler_type":'linear',
  "warmup_steps" : 100,

  }

In [3]:
#load dataset
seed = 210

data = pd.read_csv("../../data/Cleaned Data/CNN_comments_clean.csv")  
comments = data["comment"].astype(str).sample(frac=0.1, random_state=seed)

train_comments, test_comments = train_test_split(comments, test_size=0.3, random_state=seed)
val_comments, test_comments = train_test_split(test_comments, test_size=0.5, random_state=seed)

train_dataset = Dataset.from_pandas(pd.DataFrame({"text": train_comments}))
val_dataset = Dataset.from_pandas(pd.DataFrame({"text": val_comments}))
test_dataset = Dataset.from_pandas(pd.DataFrame({"text": test_comments}))

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast = True)
tokenizer.pad_token = tokenizer.eos_token 
tokenizer.padding_side = "right"
def tokenize_function(examples):
    return tokenizer(examples["text"], **tokenizer_params_dict) 

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_datasets = tokenized_datasets.remove_columns(["__index_level_0__"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map: 100%|██████████| 115390/115390 [00:23<00:00, 4871.58 examples/s]
Map: 100%|██████████| 24726/24726 [00:04<00:00, 5252.64 examples/s]
Map: 100%|██████████| 24727/24727 [00:04<00:00, 5758.84 examples/s]


In [5]:
def tune_hyperparameters():
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=15)
    return study

def objective(trial):
    lr = trial.suggest_uniform('lr',1e-7, 5e-5)
    r = trial.suggest_categorical('r', [32,64,128])
    lora_alpha = trial.suggest_categorical('lora_alpha', [8,16,32])

    lora_config_params_dict['lora_alpha'] = lora_alpha
    lora_config_params_dict['r'] = r
    training_args_dict['learning_rate'] = lr
    
    model = AutoModelForCausalLM.from_pretrained(**from_pretrained_params_dict)
    peft_config =  LoraConfig(**lora_config_params_dict)
    model = get_peft_model(model, peft_config)

    training_args = TrainingArguments(**training_args_dict)
    trainer = EvalSampleDatasetTrainer(
        eval_sample_size_proportion = .25,
        seed = seed,
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    trainer.train()
    perplexity = trainer.evaluate()['eval_perplexity']
    del model, peft_config, training_args, trainer
    gc.collect()
    torch.cuda.empty_cache()
    return perplexity
    

In [6]:
study = tune_hyperparameters()
df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
df.to_csv('/home/bhx5gh/Documents/NLP/NLP_Final_Political_Bias_Shifts/src/LLMs/mistral_hyper_param_tuning_results.csv')

[I 2024-12-14 13:23:32,267] A new study created in memory with name: no-name-4fd284b7-aad7-4a33-89a9-d3dc2f75d2f4
  lr = trial.suggest_uniform('lr',1e-7, 5e-5)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.37s/it]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps 

Step,Training Loss
