In [1]:
import math
import os

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the cleaned CNBC comments and build train/validation/test splits.
data = pd.read_csv("../../data/Cleaned Data/CNBC_comments_clean.csv")

# Drop missing comments *before* casting to str, otherwise NaN becomes the
# literal text "nan" and ends up in the training corpus. Then keep a
# reproducible 10% sample so fine-tuning stays tractable.
comments = (
    data["comment"]
    .dropna()
    .astype(str)
    .sample(frac=0.1, random_state=42)
)

# 70% train, then split the remaining 30% evenly into validation and test.
train_comments, holdout_comments = train_test_split(comments, test_size=0.3, random_state=42)
val_comments, test_comments = train_test_split(holdout_comments, test_size=0.5, random_state=42)

# preserve_index=False: the sampled Series carries a shuffled, non-Range index,
# which from_pandas would otherwise store as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(pd.DataFrame({"text": train_comments}), preserve_index=False)
val_dataset = Dataset.from_pandas(pd.DataFrame({"text": val_comments}), preserve_index=False)
test_dataset = Dataset.from_pandas(pd.DataFrame({"text": test_comments}), preserve_index=False)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})




In [3]:
# Load tokenizer and model
# The checkpoint directory can be overridden with the LLAMA_MODEL_PATH
# environment variable so the notebook also runs on machines other than the
# original author's; the default is the original local path.
model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/home/puschb/UVA/NLP/NLP_Final_Political_Bias_Shifts/src/Local Models/meta-llama_Llama-3.2-3B",
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Fall back to EOS as the padding token when the tokenizer defines none.
# Reusing an existing token keeps the vocabulary size unchanged; adding a
# dedicated '[PAD]' token instead would presumably also require resizing the
# model's token embeddings -- TODO confirm before switching.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens using the
    notebook-level ``tokenizer``; the tokenizer's output is returned as-is.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.59s/it]


In [4]:
# Tokenize each split and drop every original column (the raw 'text' plus any
# pandas index column that Dataset.from_pandas may have attached), so only the
# tokenizer outputs remain.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# No explicit 'labels' column is needed: with mlm=False the collator builds the
# labels for each batch from input_ids and sets padding positions to -100 so
# they are ignored by the loss. NOTE(review): because pad_token may be aliased
# to eos_token, any genuine EOS tokens are masked the same way.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/8838 [00:00<?, ? examples/s]

Map: 100%|██████████| 8838/8838 [00:01<00:00, 7831.96 examples/s]
Map: 100%|██████████| 1894/1894 [00:00<00:00, 11831.55 examples/s]
Map: 100%|██████████| 1895/1895 [00:00<00:00, 11895.73 examples/s]
Map: 100%|██████████| 8838/8838 [00:01<00:00, 5813.23 examples/s]
Map: 100%|██████████| 1894/1894 [00:00<00:00, 5981.37 examples/s]
Map: 100%|██████████| 1895/1895 [00:00<00:00, 6734.81 examples/s]


In [5]:
# NOTE(review): everything below is disabled code kept inside a bare string
# literal. Nothing in it is executed; the notebook merely echoes the string as
# this cell's output. It tokenizes the three comment splits directly with the
# tokenizer and clones input_ids into labels -- presumably an earlier approach
# superseded by the Dataset.map-based tokenization; confirm and delete if so.
"""# Tokenize data
train_encodings = tokenizer(train_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")
train_encodings['labels'] = train_encodings['input_ids'].clone() # following this: https://huggingface.co/docs/transformers/en/tasks/language_modeling
# pad tokens shouldnt effect loss becuase it is being set to eos toekn
val_encodings = tokenizer(val_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings['labels'] = val_encodings['input_ids'].clone()
test_encodings = tokenizer(test_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings['labels'] = test_encodings['input_ids'].clone()

print(f"input_ids shape: {train_encodings['input_ids'].shape}")
print(f"attention_mask shape: {train_encodings['attention_mask'].shape}")
print(f"labels shape: {train_encodings['labels'].shape if 'labels' in train_encodings else 'No labels found'}")
"""

'# Tokenize data\ntrain_encodings = tokenizer(train_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")\ntrain_encodings[\'labels\'] = train_encodings[\'input_ids\'].clone() # following this: https://huggingface.co/docs/transformers/en/tasks/language_modeling\n# pad tokens shouldnt effect loss becuase it is being set to eos toekn\nval_encodings = tokenizer(val_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")\nval_encodings[\'labels\'] = val_encodings[\'input_ids\'].clone()\ntest_encodings = tokenizer(test_comments, truncation=True, padding=True, max_length=128, return_tensors="pt")\ntest_encodings[\'labels\'] = test_encodings[\'input_ids\'].clone()\n\nprint(f"input_ids shape: {train_encodings[\'input_ids\'].shape}")\nprint(f"attention_mask shape: {train_encodings[\'attention_mask\'].shape}")\nprint(f"labels shape: {train_encodings[\'labels\'].shape if \'labels\' in train_encodings else \'No labels found\'}")\n'

In [6]:
# Set up LoRA configuration
lora_config = LoraConfig(
    r=16,   # Rank of the low-rank update matrices
    lora_alpha=32,
    task_type=TaskType.CAUSAL_LM,
    lora_dropout=0.1,
)

# Apply LoRA to model
# NOTE: this rebinds `model`, so re-running the cell without reloading the base
# model would wrap an already-wrapped model.
model = get_peft_model(model, lora_config)

# Define training arguments with evaluation and checkpointing every 1000 steps
training_args = TrainingArguments(
    output_dir="../../fine_tuned_llms/llama_3_2_3b/checkpoints",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=1000,                      # Evaluate every 1000 steps
    save_steps=1000,                      # Save checkpoint every 1000 steps
    load_best_model_at_end=True,          # Load the best model at the end
    # Select the best checkpoint by validation loss. The previous setting,
    # "eval_accuracy", is never produced (no accuracy metric is computed for
    # this causal-LM run), so best-model tracking could not find it.
    metric_for_best_model="eval_loss",
    greater_is_better=False,              # Lower loss indicates a better model
    logging_dir="../../fine_tuned_llms/llama_3_2_3b/metrics",
    fp16=True
)

# perplexity approximation
def compute_perplexity(eval_preds):
    """Compute next-token perplexity from model logits and labels.

    Args:
        eval_preds: A ``(logits, labels)`` pair. Each element may be a NumPy
            array or a torch tensor; ``logits`` may also be a tuple/list whose
            first element holds the logits. ``logits`` is indexed as
            ``(..., seq_len, vocab_size)`` and ``labels`` as ``(..., seq_len)``.

    Returns:
        dict: ``{"perplexity": float}`` -- the exponential of the mean
        cross-entropy between the logits at position ``t`` and the label at
        position ``t + 1``. Label positions equal to ``-100`` are ignored.
    """
    logits, labels = eval_preds
    if isinstance(logits, (tuple, list)):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]

    # Metric callbacks can receive NumPy arrays, which lack .contiguous() /
    # .view() / .size(); convert so the torch ops below work for either input.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # Shift so that the prediction at position t is scored against token t + 1.
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    loss = torch.nn.functional.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
        ignore_index=-100,  # skip masked (e.g. padding) label positions
    )
    return {"perplexity": torch.exp(loss).item()}



In [7]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# train/validation splits and the language-modeling collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

# Launch fine-tuning
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out test split.
eval_results = trainer.evaluate(tokenized_datasets["test"])

# 'eval_perplexity' only exists when a perplexity metric function is registered
# on the Trainer; otherwise derive it from the reported mean evaluation loss
# instead of raising KeyError.
if "eval_perplexity" in eval_results:
    test_perplexity = eval_results["eval_perplexity"]
else:
    test_perplexity = math.exp(eval_results["eval_loss"])

print(f"Perplexity on test set: {test_perplexity}")

In [None]:
from datasets import load_dataset

# Pull the first 5000 rows of the train split of the "eli5_category" dataset.
eli5 = load_dataset(
    path="eli5_category",
    split="train[:5000]",
)


Downloading data: 100%|██████████| 62.3M/62.3M [00:01<00:00, 32.4MB/s]
Downloading data: 100%|██████████| 5.00M/5.00M [00:00<00:00, 33.1MB/s]
Downloading data: 100%|██████████| 1.76M/1.76M [00:00<00:00, 36.2MB/s]
Downloading data: 100%|██████████| 3.85M/3.85M [00:00<00:00, 35.3MB/s]
Generating train split: 100%|██████████| 91772/91772 [00:07<00:00, 11999.20 examples/s]
Generating validation1 split: 100%|██████████| 5446/5446 [00:00<00:00, 9646.19 examples/s] 
Generating validation2 split: 100%|██████████| 2375/2375 [00:00<00:00, 12216.79 examples/s]
Generating test split: 100%|██████████| 5411/5411 [00:00<00:00, 10407.36 examples/s]


AttributeError: 'Dataset' object has no attribute 'keys'

In [None]:
# Hold out 20% of the rows as a test split. The seed makes the split
# reproducible across kernel restarts, matching the fixed seeds (42) used for
# the other splits in this notebook.
# NOTE: this rebinds `eli5` to the split result, so the cell is meant to run
# once per load of the dataset.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)

<class 'datasets.arrow_dataset.Dataset'>
