# Loading the dataset

We will use the Datasets library to download the data and get the metric we need to use for further evaluation between models.

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

Set a seed to ensure reproducibility.

In [5]:
seed = 306
torch.manual_seed(seed)
np.random.seed(seed)

We set the default device to CPU so that all models are created on the CPU instead of the MPS/GPU backend.

In [7]:
torch.set_default_device("cpu")

Load the split dair-ai/emotion dataset, which has a total of 20,000 examples split into train, validation and split sets.

In [9]:
dataset = load_dataset("dair-ai/emotion", "split")

We can access the element with its split and index.

In [11]:
print(dataset["train"][0])

{'text': 'i didnt feel humiliated', 'label': 0}


Extract the names of the six emotion labels.

In [13]:
emotion_labels = dataset["train"].features["label"].names
print("Label names:", emotion_labels)

Label names: ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


# Preprocessing the data

Set our model checkpoint and load its tokenizer. Then, define a tokenize_function and apply it to the whole dataset in batched mode for efficiency. Dynamic padding pads each batch only to the length of its longest sequence, making training faster and more efficient than padding everything to a fixed maximum length. Because 99% of texts have fewer than 57 tokens, using a max length of 64 is sufficient and avoids unnecessary memory usage.

In [16]:
checkpoint = "prajjwal1/bert-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=64)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [17]:
lengths = []

for text in dataset["train"]["text"]:
    tokens = tokenizer(text, truncation=False, add_special_tokens=True)
    lengths.append(len(tokens["input_ids"]))

print("Max length:", np.max(lengths))
print("Mean length:", np.mean(lengths))
print("99th percentile:", np.percentile(lengths, 99))

Max length: 87
Mean length: 22.2595
99th percentile: 57.0


In [18]:
print(tokenized_datasets["train"].column_names)

['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']


We remove the raw text column, rename the label column to â€˜labelsâ€™, and set the dataset format to PyTorch tensors to ensure compatibility with the Trainer API and maintain a consistent input structure across all models.

In [20]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

print(tokenized_datasets["train"].column_names)

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


# Fine-tuning the model

We can now use the preprocessed data to fine-tune the model. We use the AutoModelForSequenceClassification class for emotion classifications. There are 6 labels because the emotion dataset has six emotion categories.

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=6)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define a compute_metrics function that converts the model logits into predicted class labels using the argmax operation, compares these predictions with the true labels, and returns evaluation metrics such as accuracy, macro precision, macro recall, and macro F1-score. We use macro averaging treats all classes equally, because our emotion dataset is imbalanced.

In [25]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="macro", zero_division=0)

    return {
        "accuracy": accuracy,
        "macro_precision": precision,
        "macro_recall": recall,
        "macro_f1": f1
    }

Set up the TrainingArguments to specify where model checkpoints are saved and evaluation should be made at the end of each epoch. We also set the batch size to 8 to ensure stable training on a CPU-based environment, which can be adjusted later. Since the last model in the training might not be the best one, we let the Trainer load the best model at the end. 

In [27]:
model_name = checkpoint.split("/")[-1]
batch_size = 8

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned", # Where to save checkpoints
    eval_strategy="epoch",                # Evaluate at the end of each epoch
    save_strategy="epoch",                # Save a checkpoint each epoch
    learning_rate=2e-5,                   # A standard fine-tuning learning rate
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,                  
    weight_decay=0.01,                    # Slight regularization
    load_best_model_at_end=True,          # Restore best model (on validation set)
    metric_for_best_model="macro_f1",     # Use macro F1 to select the best checkpoint
    greater_is_better=True,               # Higher macro F1 is better
    use_cpu=True,
)



Pass the model, training arguments, training and test datasets, and evaluation function to the trainer we have created. Then we can call train() to start training.

In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator, 
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Macro Precision,Macro Recall,Macro F1
1,0.3203,0.252904,0.916,0.877808,0.905361,0.889635
2,0.2311,0.202321,0.9315,0.90066,0.909096,0.904687
3,0.1592,0.208344,0.9355,0.910749,0.90916,0.90915


TrainOutput(global_step=6000, training_loss=0.32123424784342447, metrics={'train_runtime': 565.4055, 'train_samples_per_second': 84.895, 'train_steps_per_second': 10.612, 'total_flos': 152955539229312.0, 'train_loss': 0.32123424784342447, 'epoch': 3.0})

In [30]:
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.24242918193340302,
 'eval_accuracy': 0.9205,
 'eval_macro_precision': 0.872061240911096,
 'eval_macro_recall': 0.8696265739494918,
 'eval_macro_f1': 0.8703872097839622,
 'eval_runtime': 4.9691,
 'eval_samples_per_second': 402.483,
 'eval_steps_per_second': 50.31,
 'epoch': 3.0}

# Evaluation and metrics

Compute per-class F1-scores to identify which emotions are particularly challenging for smaller models.

In [32]:
pred = trainer.predict(tokenized_datasets["test"])
y_true = pred.label_ids
y_pred = pred.predictions.argmax(axis=-1)

per_class_f1 = f1_score(y_true, y_pred, average=None, zero_division=0)

f1_df = pd.DataFrame({
    "Emotion": emotion_labels,
    "F1-score": per_class_f1
})
f1_df = f1_df.sort_values("F1-score", ascending=False).reset_index(drop=True)
f1_df

Draw a confusion matrix heatmap using seaborn.

In [34]:
cm = confusion_matrix(y_true, y_pred)
cm_df = pd.DataFrame(cm, index=emotion_labels, columns=emotion_labels)

plt.figure(figsize=(8,6))
sns.heatmap(
    cm_df, 
    annot=True, 
    fmt="d", 
    cmap="Oranges",
    linewidths=.5, 
    cbar=True
)
plt.title("Confusion Matrix for BERT-Tiny Model")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.savefig("confusion_matrix_berttiny.png", dpi=300, bbox_inches="tight")
plt.show()