In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import os
import time
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset, Dataset
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

In [11]:
# =====================================
# Load GoEmotions Dataset (Reduced Sample)
# =====================================
print("📥 Loading GoEmotions dataset (optimized sample)...")
start_time = time.time()

dataset = load_dataset("go_emotions", "simplified")  # 6 emotions + neutral

📥 Loading GoEmotions dataset (optimized sample)...


In [12]:
# CRITICAL OPTIMIZATION: Use much smaller samples
print("Original dataset sizes:")
print(f"Train: {len(dataset['train'])}, Val: {len(dataset['validation'])}, Test: {len(dataset['test'])}")

Original dataset sizes:
Train: 43410, Val: 5426, Test: 5427


In [13]:
# Reduce dataset size dramatically for speed
TRAIN_SAMPLE = 3000  # Instead of 43k+
VAL_SAMPLE = 800     # Instead of 5k+
TEST_SAMPLE = 1000   # Instead of 5k+

In [14]:
train_df = pd.DataFrame(dataset["train"]).sample(n=TRAIN_SAMPLE, random_state=42)
val_df = pd.DataFrame(dataset["validation"]).sample(n=VAL_SAMPLE, random_state=42)
test_df = pd.DataFrame(dataset["test"]).sample(n=TEST_SAMPLE, random_state=42)

print(f"Reduced dataset sizes:")
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Reduced dataset sizes:
Train: 3000, Val: 800, Test: 1000


In [15]:
# Process labels
train_df["label"] = train_df["labels"].apply(lambda x: x[0] if isinstance(x, list) else x)
val_df["label"] = val_df["labels"].apply(lambda x: x[0] if isinstance(x, list) else x)
test_df["label"] = test_df["labels"].apply(lambda x: x[0] if isinstance(x, list) else x)

In [16]:
# Encode labels
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["label"])
val_df["label"] = label_encoder.transform(val_df["label"])
test_df["label"] = label_encoder.transform(test_df["label"])

In [17]:
# Keep only text and label columns
train_df = train_df[["text", "label"]].reset_index(drop=True)
val_df = val_df[["text", "label"]].reset_index(drop=True)
test_df = test_df[["text", "label"]].reset_index(drop=True)

print("Dataset prepared. Classes:", list(label_encoder.classes_))
print(f"Data loading time: {time.time() - start_time:.2f} seconds")

Dataset prepared. Classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27)]
Data loading time: 222.63 seconds


In [18]:
# =====================================
# Tokenization (Optimized)
# =====================================
print("Tokenizing texts...")
tokenize_start = time.time()

save_path = "./fast_emotion_model"
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# OPTIMIZATION: Shorter max_length for speed
MAX_LENGTH = 64  # Instead of 128 or 512

def tokenize_batch(batch):
    return tokenizer(
        batch["text"], 
        padding="max_length", 
        truncation=True, 
        max_length=MAX_LENGTH
    )

Tokenizing texts...


In [19]:
# Convert to HuggingFace datasets and tokenize
train_dataset = Dataset.from_pandas(train_df).map(tokenize_batch, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_batch, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_batch, batched=True)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [20]:
# Rename label column for trainer
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset = val_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

In [21]:
# Set format for PyTorch
columns_to_keep = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format("torch", columns=columns_to_keep)
val_dataset.set_format("torch", columns=columns_to_keep)
test_dataset.set_format("torch", columns=columns_to_keep)

print(f"Tokenization complete. Time: {time.time() - tokenize_start:.2f} seconds")

Tokenization complete. Time: 47.48 seconds


In [22]:
# =====================================
# Model Setup and Optimization
# =====================================
print("Setting up model...")
model_start = time.time()

Setting up model...


In [23]:
# Check if model already exists
if os.path.exists(save_path):
    print("📂 Loading existing model from disk...")
    model = DistilBertForSequenceClassification.from_pretrained(save_path)
    tokenizer = DistilBertTokenizer.from_pretrained(save_path)
    print("Model loaded from disk!")
else:
    print("Training new model...")
    
    # Load pre-trained model
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", 
        num_labels=num_labels
    )
    
    # CRITICAL OPTIMIZATION: Freeze early layers for faster training
    print("Freezing early layers for speed...")

🚀 Training new model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


❄️ Freezing early layers for speed...


In [26]:
# Freeze embeddings
for param in model.distilbert.embeddings.parameters():
    param.requires_grad = False
    
    # Freeze first 3 transformer layers (out of 6)
    for i in range(3):
        for param in model.distilbert.transformer.layer[i].parameters():
            param.requires_grad = False
    
    # Count trainable parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.1f}%)")
    
    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Total parameters: 66,975,004
Trainable parameters: 22,270,492 (33.3%)
Total parameters: 66,975,004
Trainable parameters: 21,877,276 (32.7%)
Total parameters: 66,975,004
Trainable parameters: 21,876,508 (32.7%)
Total parameters: 66,975,004
Trainable parameters: 21,875,740 (32.7%)


In [34]:
 # Metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}


# OPTIMIZED TRAINING ARGUMENTS for speed
training_args = TrainingArguments(
    output_dir="./fast_results",
    eval_strategy="epoch",
    save_strategy="no",  # Don't save intermediate checkpoints
    learning_rate=5e-5,  # Higher learning rate for faster convergence
    per_device_train_batch_size=32,  # Larger batch size for speed
    per_device_eval_batch_size=64,   # Even larger for evaluation
    num_train_epochs=2,  # Fewer epochs
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=False,
    dataloader_num_workers=0,  # Important: 0 for CPU
    remove_unused_columns=True,
    push_to_hub=False,
    report_to=None,  # Disable wandb/tensorboard
)


# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


# Start training
print("Starting training...")
train_start = time.time()
trainer.train()
train_time = time.time() - train_start
print(f"Training completed in {train_time/60:.1f} minutes!")

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,2.8318,2.418839,0.37375
2,2.2061,2.256636,0.39


Training completed in 21.2 minutes!
