In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_properties(0).total_memory)

NVIDIA A40
47608692736


In [3]:
#load dataset
data = pd.read_csv("../../data/Cleaned Data/CNBC_comments_clean.csv")  
comments = data["comment"].astype(str).sample(frac=0.1, random_state=42)

train_comments, test_comments = train_test_split(comments, test_size=0.3, random_state=42)
val_comments, test_comments = train_test_split(test_comments, test_size=0.5, random_state=42)

train_dataset = Dataset.from_pandas(pd.DataFrame({"text": train_comments}))
val_dataset = Dataset.from_pandas(pd.DataFrame({"text": val_comments}))
test_dataset = Dataset.from_pandas(pd.DataFrame({"text": test_comments}))

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})




In [4]:
# Load tokenizer and model
model_path = "../Local Models/meta-llama_Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # maybe this instead?: tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)



Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.83s/it]


In [5]:
# Tokenize each split and remove the 'text' column
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Add 'labels' field for causal language modeling
tokenized_datasets = tokenized_datasets.map(lambda examples: {"labels": examples["input_ids"]})

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map: 100%|██████████| 8838/8838 [00:00<00:00, 17096.21 examples/s]
Map: 100%|██████████| 1894/1894 [00:00<00:00, 18697.51 examples/s]
Map: 100%|██████████| 1895/1895 [00:00<00:00, 17829.52 examples/s]
Map: 100%|██████████| 8838/8838 [00:00<00:00, 15521.69 examples/s]
Map: 100%|██████████| 1894/1894 [00:00<00:00, 15143.25 examples/s]
Map: 100%|██████████| 1895/1895 [00:00<00:00, 15601.27 examples/s]


In [6]:
"""# Tokenize data
train_encodings = tokenizer(train_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")
train_encodings['labels'] = train_encodings['input_ids'].clone() # following this: https://huggingface.co/docs/transformers/en/tasks/language_modeling
# pad tokens shouldnt effect loss becuase it is being set to eos toekn
val_encodings = tokenizer(val_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings['labels'] = val_encodings['input_ids'].clone()
test_encodings = tokenizer(test_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings['labels'] = test_encodings['input_ids'].clone()

print(f"input_ids shape: {train_encodings['input_ids'].shape}")
print(f"attention_mask shape: {train_encodings['attention_mask'].shape}")
print(f"labels shape: {train_encodings['labels'].shape if 'labels' in train_encodings else 'No labels found'}")
"""

'# Tokenize data\ntrain_encodings = tokenizer(train_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")\ntrain_encodings[\'labels\'] = train_encodings[\'input_ids\'].clone() # following this: https://huggingface.co/docs/transformers/en/tasks/language_modeling\n# pad tokens shouldnt effect loss becuase it is being set to eos toekn\nval_encodings = tokenizer(val_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")\nval_encodings[\'labels\'] = val_encodings[\'input_ids\'].clone()\ntest_encodings = tokenizer(test_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")\ntest_encodings[\'labels\'] = test_encodings[\'input_ids\'].clone()\n\nprint(f"input_ids shape: {train_encodings[\'input_ids\'].shape}")\nprint(f"attention_mask shape: {train_encodings[\'attention_mask\'].shape}")\nprint(f"labels shape: {train_encodings[\'labels\'].shape if \'labels\' in train_encodings else \'No labels found\'}")\n'

In [None]:
# Set up LoRA configuration
lora_config = LoraConfig(
    r=16,   # Rank
    lora_alpha=32,
    task_type=TaskType.CAUSAL_LM,
    lora_dropout=0.1,
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)

# Define training arguments with evaluation and checkpointing every 150 steps
training_args = TrainingArguments(
    output_dir="../../fine_tuned_llms/llama_3_2_3b/checkpoints",
    per_device_train_batch_size=32, #using A40 gpu, not sure if rivanna can handle more, sticking with this for now
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", #"perplexity",   # Select the best model based on perplexity
    greater_is_better=False,              # Lower perplexity is better
    logging_dir="../../fine_tuned_llms/llama_3_2_3b/metrics",
    fp16=True,
    save_total_limit=3, # only keeping best 3
)


#potential issue:
# The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.


# perplexity approximation
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    
    # Calculate loss
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    
    # Calculate perplexity
    perplexity = torch.exp(loss)
    
    return {"eval_loss": loss.item(), "perplexity": perplexity.item()}



In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    #compute_metrics=compute_metrics  # Pass the compute_metrics function
    # this is causing massive memory usage because it accumulates all of the tensors before evaluating them (dumb!), there is a fix here that we have to implement
    #https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/12 - morenolq's answer
)

# Finetune model
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
50,No log,3.453657
100,No log,3.381265
150,No log,3.358906
200,No log,3.349973
250,No log,3.344757
300,No log,3.341569
350,No log,3.338951
400,No log,3.337027
450,No log,3.335171
500,3.305500,3.333374


TrainOutput(global_step=831, training_loss=3.283893534709687, metrics={'train_runtime': 1375.0364, 'train_samples_per_second': 19.282, 'train_steps_per_second': 0.604, 'total_flos': 5.749086862599782e+16, 'train_loss': 3.283893534709687, 'epoch': 3.0})

In [11]:
eval_results = trainer.evaluate(tokenized_datasets["test"])
print(f"Loss on test set: {eval_results['eval_loss']}")

Loss on test set: 3.335228443145752


In [1]:
from datasets import load_dataset

eli5 = load_dataset("eli5_category", split="train[:5000]")


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
eli5 = eli5.train_test_split(test_size=0.2)

<class 'datasets.arrow_dataset.Dataset'>
