In [None]:
!pip install transformers datasets accelerate pandas numpy evaluate

In [16]:
import pandas as pd
import numpy as np
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import Dataset
from google.colab import drive
import evaluate

def parse_emotion_file(file_path):
    """
    Parses a text file with each line in the format: {text; emotion}
    and returns a pandas DataFrame with 'text' and 'emotion' columns.

    Args:
    - file_path (str): Path to the .txt file to be parsed

    Returns:
    - df (pd.DataFrame): DataFrame containing 'text' and 'emotion' columns
    """
    texts = []
    emotions = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                # Split each line by the semicolon separator
                text, emotion = line.strip().split(';')

                # append text and emotion to separate lists
                texts.append(text)
                emotions.append(emotion)
            except ValueError:
                continue

    return pd.DataFrame({'text': texts, 'emotion': emotions})


In [17]:
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")


GPU available: True
GPU device: Tesla T4
GPU memory: 14.74 GB


In [18]:
## Connecting to the Google Drive
from google.colab import drive
drive.mount('/content/drive')

PARENT_DIR = '/content/drive/MyDrive/MLEng/'
MODEL_DIR = os.path.join(PARENT_DIR, 'model_outputs/')
os.makedirs(MODEL_DIR, exist_ok=True)

# Update file paths
# Parse text files and store as Pandas DataFrames
train_df = parse_emotion_file(PARENT_DIR + "data/train.txt")
val_df = parse_emotion_file(PARENT_DIR + "data/val.txt")
test_df = parse_emotion_file(PARENT_DIR + "data/test.txt")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(val_df)} samples")
print(f"Test set: {len(test_df)} samples")
print(f"Emotion classes: {train_df['emotion'].unique()}")
print(f"Class distribution in training set:")
print(train_df['emotion'].value_counts())

Train set: 16000 samples
Validation set: 2000 samples
Test set: 2000 samples
Emotion classes: ['sadness' 'anger' 'love' 'surprise' 'fear' 'joy']
Class distribution in training set:
emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64


In [20]:
# Define the model path for DistilBERT
model_name = "distilbert-base-uncased"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create label mappings
labels = sorted(train_df["emotion"].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"Label to ID mapping: {label2id}")

Label to ID mapping: {'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}


In [21]:
# Convert pandas DataFrames to HuggingFace datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Define tokenization function with batching for efficiency
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128  # Reduced from 512 for better performance
    )

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


In [23]:
# Add labels
def add_labels(examples):
    examples["labels"] = [label2id[emotion] for emotion in examples["emotion"]]
    return examples

train_dataset = train_dataset.map(add_labels, batched=True)
val_dataset = val_dataset.map(add_labels, batched=True)
test_dataset = test_dataset.map(add_labels, batched=True)

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label
)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Load metric for evaluation
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [25]:
# Set up training arguments optimized for T4 GPU
training_args = TrainingArguments(
    output_dir=os.path.join(MODEL_DIR, "checkpoints"),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,  # Increased batch size
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    save_total_limit=2,  # Keep only the 2 best checkpoints to save space
    fp16=True,  # Enable mixed precision training for faster performance on T4
    gradient_accumulation_steps=2  # Accumulate gradients for effective larger batch size
)

# Initialize trainer with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train the model
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mafrologicinsect[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.224337,0.9205
2,0.401900,0.147065,0.9415
3,0.401900,0.129139,0.9425
4,0.100700,0.135578,0.941
5,0.100700,0.134378,0.939


TrainOutput(global_step=1250, training_loss=0.2147744094848633, metrics={'train_runtime': 702.5386, 'train_samples_per_second': 113.873, 'train_steps_per_second': 1.779, 'total_flos': 2649536962560000.0, 'train_loss': 0.2147744094848633, 'epoch': 5.0})

In [26]:
# Evaluate the model on the test set
test_results = trainer.evaluate(test_dataset)
print(f"Test results: {test_results}")

# Save the final model to the parent directory
final_model_path = os.path.join(MODEL_DIR, "final_model")
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Model saved to {final_model_path}")

Test results: {'eval_loss': 0.15945003926753998, 'eval_accuracy': 0.925, 'eval_runtime': 1.6421, 'eval_samples_per_second': 1217.947, 'eval_steps_per_second': 19.487, 'epoch': 5.0}
Model saved to /content/drive/MyDrive/MLEng/model_outputs/final_model


In [27]:
# Save label mappings for inference
import json
with open(os.path.join(final_model_path, "label_mappings.json"), "w") as f:
    json.dump({"label2id": label2id, "id2label": id2label}, f)

# Example of how to use the model for inference
from transformers import pipeline

emotion_classifier = pipeline(
    "text-classification",
    model=final_model_path,
    tokenizer=final_model_path
)

# Test inference with a sample text
test_text = "I'm feeling very happy today!"
result = emotion_classifier(test_text)
print(f"Sample text: '{test_text}'")
print(f"Predicted emotion: {result[0]['label']}, Score: {result[0]['score']:.4f}")

Device set to use cuda:0


Sample text: 'I'm feeling very happy today!'
Predicted emotion: joy, Score: 0.9976
