In [None]:
# Install required libraries
!pip install -q transformers datasets trl scikit-learn torch

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report
import os
from trl import DPOConfig, DPOTrainer

In [None]:
# Announce and fetch the dataset configuration named in the message below
print("Loading SemEval 2018 Task 1 Dataset (Subtask 5 - English)")
dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")

# Report how many examples each split contains
print("\nDataset Sizes:")
for split_title, split_key in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{split_title} set: {len(dataset[split_key])} samples")

Loading SemEval 2018 Task 1 Dataset (Subtask 5 - English)

Dataset Sizes:
Train set: 6838 samples
Validation set: 886 samples
Test set: 3259 samples


In [None]:
# Every training column other than the tweet id and text is treated as a label
non_label_columns = ('ID', 'Tweet')
labels = [name for name in dataset['train'].features if name not in non_label_columns]

# Two-way lookup between label names and their position in `labels`
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

# Show the index -> label mapping
print("\nLabels:")
for idx, label in id2label.items():
    print(f"{idx}: {label}")


Labels:
0: anger
1: anticipation
2: disgust
3: fear
4: joy
5: love
6: optimism
7: pessimism
8: sadness
9: surprise
10: trust


In [None]:
# Load the tokenizer published with the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)


In [None]:
# Prepare dataset for DPO
def prepare_dpo_dataset(examples, label_names=None):
    """Convert a batch of multi-label tweets into DPO preference triples.

    For each tweet, the labels whose value is 1 form the "chosen" completion
    and the labels whose value is 0 form the "rejected" completion. Tweets
    that have no present label, or no absent label, cannot form a preference
    pair and are dropped from the output.

    Args:
        examples: Batch dict (as passed by ``Dataset.map(..., batched=True)``)
            holding a ``'Tweet'`` column and one column per label name.
        label_names: Label columns to read from ``examples``. Defaults to the
            notebook-level ``labels`` list when omitted.

    Returns:
        Dict with the keys ``'prompt'``, ``'chosen'`` and ``'rejected'``. The
        three lists always have the same length, which may be smaller than
        the input batch because unusable tweets are skipped.
    """
    if label_names is None:
        label_names = labels  # module-level list built in the label-preparation cell

    label_columns = [examples[name] for name in label_names]

    prompts = []
    preferred_samples = []
    rejected_samples = []

    for tweet, *label_values in zip(examples['Tweet'], *label_columns):
        present_labels = [name for name, val in zip(label_names, label_values) if val == 1]
        absent_labels = [name for name, val in zip(label_names, label_values) if val == 0]

        # A preference pair needs something on both sides.
        if not (present_labels and absent_labels):
            continue

        # The prompt is appended in lockstep with the completions so that all
        # three columns stay aligned; returning the unfiltered tweet list here
        # makes Arrow reject the batch because the column lengths differ.
        prompts.append(tweet)
        # The tweet already lives in 'prompt', so the completions carry only
        # the emotion list instead of repeating the tweet text.
        preferred_samples.append(f" Emotions: {', '.join(present_labels)}")
        rejected_samples.append(f" Emotions: {', '.join(absent_labels)}")

    return {
        'prompt': prompts,
        'chosen': preferred_samples,
        'rejected': rejected_samples
    }

In [None]:
# Run the preprocessing function over every split, in batches of 16, dropping
# the original columns so only the fields it returns remain.
# NOTE(review): `preprocess_data` is not defined in any cell visible in this
# notebook — confirm it exists before this cell when run on a fresh kernel.
print("\nPreprocessing dataset...")
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    batch_size=16,  # Ensure consistent batch processing
    remove_columns=dataset['train'].column_names,
)

# Have the encoded splits hand back torch tensors
encoded_dataset.set_format("torch")


Preprocessing dataset...


Map:   0%|          | 0/6838 [00:00<?, ? examples/s]

Map:   0%|          | 0/3259 [00:00<?, ? examples/s]

Map:   0%|          | 0/886 [00:00<?, ? examples/s]

In [None]:
# Disable Wandb by setting its environment flag for this process
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Build the preference dataset from the training split. The source columns are
# dropped so only the fields returned by prepare_dpo_dataset remain; every one
# of those fields must have the same length, otherwise this map call raises
# ArrowInvalid (as in the output saved with this cell).
processed_dataset = dataset['train'].map(
    prepare_dpo_dataset,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

# Output locations, logging cadence and optimisation settings for the DPO run
training_args = DPOConfig(
    output_dir="./emotion_classification_dpo_results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=100,
)

# Wire model, configuration, data and tokenizer into the trainer.
# NOTE(review): `model` is not assigned in any cell visible in this notebook —
# confirm it is created before this cell when run on a fresh kernel.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    tokenizer=tokenizer,
)

# Announce the run and the number of preference samples it will use
print("\n--- Starting DPO Training ---")
print(f"Training on {len(processed_dataset)} samples")

# Run training; the returned object exposes the run's metrics
training_output = trainer.train()

# Report the runtime metric, falling back to 'N/A' when it is absent
print("\n--- Training Complete ---")
train_metrics = training_output.metrics
print("Training Duration:", train_metrics.get('train_runtime', 'N/A'))

# Inference example: score a few hand-written sentences and print every label
# whose probability reaches the 0.5 threshold.
print("\n--- Model Inference ---")
test_texts = [
    "I'm feeling really excited and happy about this new project!",
    "I'm worried about the upcoming deadline and feeling stressed.",
    "This is the most amazing day ever, I'm overjoyed!"
]

# The model has just been used for training; switch it to evaluation mode so
# layers such as dropout behave deterministically during prediction.
model.eval()

# Element-wise sigmoid, created once instead of on every loop iteration.
# Each label gets an independent probability (multi-label setting).
sigmoid = torch.nn.Sigmoid()

for text in test_texts:
    print(f"\nTest Sentence: {text}")

    # Tokenize (truncating over-long text to the tokenizer's maximum length)
    # and move the tensors to the device the model lives on.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Run inference without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Drop only the batch dimension so the result stays 1-D even when the
    # model has a single output; a bare squeeze() would collapse that too.
    # NOTE(review): assumes logits holds one score per entry of id2label —
    # confirm this matches the head of the model trained above.
    probs = sigmoid(logits.squeeze(0).cpu())
    predictions = (probs >= 0.5).numpy()

    # Print results
    print("Predicted Emotions:")
    for idx, pred in enumerate(predictions):
        if pred:
            print(f"{id2label[idx]}: {probs[idx].item():.4f}")

Map:   0%|          | 0/6838 [00:00<?, ? examples/s]

ArrowInvalid: Column 1 named chosen expected length 1000 but got length 971