# 1. Text Emotion Training - RoBERTa on GoEmotions

Fine-tune `roberta-base` on the GoEmotions dataset for single-label text emotion classification (28 classes). Run this notebook in Google Colab with a GPU runtime enabled.

In [None]:
!pip install transformers datasets accelerate -q

In [None]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score

# Use the GPU when PyTorch can see a CUDA device; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Device: {device}')

In [None]:
# Load GoEmotions (the 'simplified' configuration).
dataset = load_dataset('go_emotions', 'simplified')

# Class names in label-id order: a label id is the position of its name in this list,
# so 'neutral' is id 27.
EMOTION_LABELS = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

# Convert multi-label to single-label (use first label)
def preprocess(ex):
    """Add a single-label 'label' field to one example.

    The example's 'labels' list may hold several ids; only the first one is
    kept. An empty list falls back to id 27 (the last entry, 'neutral', in the
    notebook's label list).

    Args:
        ex: Example mapping with a 'labels' sequence. It is modified in place.

    Returns:
        The same mapping, now carrying a 'label' entry.
    """
    label_ids = ex['labels']
    if label_ids:
        ex['label'] = label_ids[0]
    else:
        ex['label'] = 27
    return ex

# Run `preprocess` over the dataset so each example gains a single-label `label` field,
# then report the number of examples in the train / validation / test splits.
dataset = dataset.map(preprocess)
print(f"Train: {len(dataset['train'])}, Val: {len(dataset['validation'])}, Test: {len(dataset['test'])}")

In [None]:
# Load tokenizer and model
MODEL = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# The classification head gets one output per emotion (28), and the id <-> name
# mappings are passed in both directions so they travel with the model.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(EMOTION_LABELS),
    id2label=dict(enumerate(EMOTION_LABELS)),
    label2id={name: idx for idx, name in enumerate(EMOTION_LABELS)},
)

print(f'Model loaded: {MODEL}')

In [None]:
# Tokenize dataset - DO NOT pad here, let data collator handle it
def tokenize(examples):
    """Tokenize the 'text' field, truncating to at most 128 tokens.

    Padding is deliberately not requested here; it is left to the data
    collator so that sequences are padded when batches are built.

    Args:
        examples: Mapping with a 'text' entry (presumably a batch of strings,
            since this is used with ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for the given text.
    """
    # No 'padding' key on purpose.
    encode_options = {'truncation': True, 'max_length': 128}
    return tokenizer(examples['text'], **encode_options)

# Tokenize in batches and drop the raw 'text', 'labels' and 'id' columns;
# the column list printed below shows what remains for training.
tokenized = dataset.map(tokenize, batched=True, remove_columns=['text', 'labels', 'id'])
print('Tokenization complete!')
print(f"Columns: {tokenized['train'].column_names}")

In [None]:
# Data collator for dynamic padding: `tokenize` does not pad, so padding is
# applied by this collator when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metrics function
def compute_metrics(p):
    """Score a set of predictions with accuracy and weighted F1.

    Args:
        p: Prediction object exposing ``predictions`` (per-class scores; the
            argmax over axis 1 is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        Dict with keys 'accuracy' and 'f1' (F1 uses ``average='weighted'``).
    """
    predicted = np.argmax(p.predictions, axis=1)
    references = p.label_ids
    accuracy = accuracy_score(references, predicted)
    weighted_f1 = f1_score(references, predicted, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

print('Data collator ready!')

In [None]:
# Training arguments
args = TrainingArguments(
    # Output location and logging.
    output_dir='./roberta_text',
    logging_steps=100,
    report_to='none',

    # Optimisation schedule and batch sizes.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present

    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the
    # best 'f1' (the key returned by compute_metrics) when training finishes.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

# Initialize trainer with data_collator
import inspect

# Newer transformers releases accept the tokenizer as `processing_class` and deprecate the
# `tokenizer` keyword, while older releases only know `tokenizer`. The install cell does not
# pin a version, so pick whichever keyword this installation's Trainer actually declares.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = 'processing_class' if 'processing_class' in _trainer_params else 'tokenizer'

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'],
    # Required: `tokenize` leaves sequences unpadded, so batches must be padded here.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

print('Trainer ready!')

In [None]:
# Train the model
# Runs fine-tuning with the settings in `args`; per those settings, evaluation and
# checkpoint saving are both configured to happen once per epoch.
print('Starting training...')
trainer.train()

In [None]:
# Evaluate on test set
# `compute_metrics` returns 'accuracy' and 'f1'; the result dict is read below with an
# 'eval_' prefix on those names.
# NOTE(review): with load_best_model_at_end=True this presumably scores the best-F1
# checkpoint rather than the final epoch - confirm.
print('Evaluating on test set...')
results = trainer.evaluate(tokenized['test'])
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")
print(f"Test F1: {results['eval_f1']:.4f}")

In [None]:
# Save model to Google Drive
import os

SAVE_PATH = '/content/drive/MyDrive/models/roberta_text'

# os.makedirs below creates SAVE_PATH whether or not Google Drive is mounted. If Drive is
# not mounted, the directory is created on the local disk instead and the model never
# reaches Drive, so warn before saving to let the user mount Drive and re-run this cell.
_drive_root = '/content/drive/MyDrive'
if not os.path.isdir(_drive_root):
    print(
        f'WARNING: {_drive_root} not found - Google Drive does not appear to be mounted; '
        'the model will be written to local storage instead of Drive.'
    )

os.makedirs(SAVE_PATH, exist_ok=True)

# Save the model weights/config and the tokenizer side by side so the directory can be
# loaded on its own later.
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print(f'Model saved to {SAVE_PATH}')

In [None]:
# Quick test
from transformers import pipeline

# Reload the saved model from SAVE_PATH and ask for the three highest-scoring labels.
classifier = pipeline(task='text-classification', model=SAVE_PATH, top_k=3)

test_texts = [
    "I'm so happy today!",
    "This is absolutely terrible.",
    "Thank you for your help!",
]

print('\nTest Predictions:')
for text in test_texts:
    result = classifier(text)
    print(f'\n"{text}"')
    # The first element holds the scored labels for this single input.
    top_predictions = result[0]
    for r in top_predictions:
        print(f"  {r['label']}: {r['score']:.3f}")