# Emotion Classification Model Comparison

This notebook compares the performance of three popular transformer models on the Emotion dataset:
1. DistilBERT (`distilbert-base-uncased`)
2. BERT (`bert-base-uncased`)
3. RoBERTa (`roberta-base`)

We fine-tune each model for two epochs on the training split, then compare accuracy and weighted F1 score on the held-out test split.

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import matplotlib.pyplot as plt
import os

# Choose the compute backend in order of preference: Apple MPS, then CUDA, then CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

### Load Dataset
We use the `dair-ai/emotion` dataset from Hugging Face.

In [None]:
# Fetch every split of the emotion dataset from the Hugging Face Hub
emotions = load_dataset(path="dair-ai/emotion")
# Show the split names, columns and row counts
print(emotions)

### Define Metrics and Training Loop
We define a helper that computes the evaluation metrics (accuracy and weighted F1) and a main function that tokenizes the data, fine-tunes a given checkpoint, and evaluates it on the test split.

In [None]:
def compute_metrics(pred):
    """Return accuracy and weighted F1 for one set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    expected = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(expected, predicted),
        'f1': f1_score(expected, predicted, average='weighted'),
    }

def train_and_evaluate(model_ckpt):
    """Fine-tune one checkpoint on the emotion dataset and score it on the test split.

    Reads the module-level ``emotions`` dataset and ``device``. Checkpoints and
    logs are written under ``"<model_ckpt>-emotion-finetuned"``.

    Args:
        model_ckpt: Hugging Face Hub checkpoint name, e.g.
            ``"distilbert-base-uncased"``.

    Returns:
        dict with the keys ``"Model"``, ``"Accuracy"``, ``"F1 Score"`` and
        ``"Loss"``: the checkpoint name plus the metrics reported by
        ``Trainer.evaluate`` on the test split.
    """
    print(f"\n{'='*30}")
    print(f"Processing model: {model_ckpt}")
    print(f"{'='*30}")

    # 1. Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    def tokenize(batch):
        # Truncate only. Padding is left to the Trainer, which (given a
        # tokenizer as processing_class and no explicit data_collator) pads
        # each batch to its own longest sequence instead of padding every
        # example to the longest sequence in the whole split.
        return tokenizer(batch['text'], truncation=True)

    print(f"Tokenizing data for {model_ckpt}...")
    emotions_encoded = emotions.map(tokenize, batched=True)

    # 2. Model Initialization
    # Take the class count from the dataset's label feature rather than
    # hard-coding it; fall back to the original value of 6 if the feature
    # does not expose `num_classes`.
    label_feature = emotions['train'].features['label']
    num_labels = getattr(label_feature, 'num_classes', 6)
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device)

    # 3. Training Arguments
    model_output_dir = f"{model_ckpt}-emotion-finetuned"
    training_args = TrainingArguments(
        output_dir=model_output_dir,
        num_train_epochs=2,
        learning_rate=2e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        weight_decay=0.01,
        eval_strategy="epoch",
        disable_tqdm=False,
        logging_dir=f'{model_output_dir}/logs',
        # Saving must follow the same schedule as evaluation for
        # load_best_model_at_end to be accepted.
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    # 4. Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=emotions_encoded['train'],
        eval_dataset=emotions_encoded['validation'],
        processing_class=tokenizer
    )

    # 5. Train
    print(f"Training {model_ckpt}...")
    trainer.train()

    # 6. Evaluate on Test Set
    print(f"Evaluating {model_ckpt} on test set...")
    eval_result = trainer.evaluate(emotions_encoded['test'])

    return {
        "Model": model_ckpt,
        "Accuracy": eval_result['eval_accuracy'],
        "F1 Score": eval_result['eval_f1'],
        "Loss": eval_result['eval_loss']
    }

### Run Comparison
We iterate through the list of models, training and evaluating each one.

In [None]:
# Checkpoints to compare, in the order they are trained
models = ["distilbert-base-uncased", "bert-base-uncased", "roberta-base"]

# One metrics dict per finished model
results = []

for model_name in models:
    results.append(train_and_evaluate(model_name))

    # Show the running table after every model so partial results are
    # visible even if a later run is interrupted
    print(f"\nPartial Results:\n{pd.DataFrame(results)}")

### Final Results Visualization

In [None]:
# Collect the per-model metric dicts into one comparison table
df_final = pd.DataFrame(results)
print("Final Comparison:")
print(df_final)

# Plotting
metric_columns = ["Accuracy", "F1 Score"]

# Keep the zoomed-in 0.8 floor when every score clears it; otherwise drop the
# floor just below the weakest score so that no bar is clipped out of view.
lowest_score = float(df_final[metric_columns].min().min())
y_floor = 0.8 if lowest_score >= 0.8 else max(0.0, lowest_score - 0.05)

fig, ax = plt.subplots(figsize=(10, 6))
df_final.plot(x="Model", y=metric_columns, kind="bar", rot=0, ax=ax)
ax.set_title("Model Comparison on Emotion Classification")
ax.set_ylabel("Score")
ax.set_ylim(y_floor, 1.0)
ax.legend(loc='lower right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()