In [1]:
# Install required packages (DeBERTa requires sentencepiece and protobuf)
%pip install transformers[sentencepiece] datasets accelerate scikit-learn protobuf

Collecting protobuf
  Downloading protobuf-6.33.2-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece])
  Downloading sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Downloading protobuf-6.33.2-cp39-abi3-manylinux2014_x86_64.whl (323 kB)
Downloading sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m21.5 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: sentencepiece, protobuf
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [protobuf]1/2[0m [protobuf]
[1A[2KSuccessfully installed protobuf-6.33.2 sentencepiece-0.2.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from datasets import load_dataset
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm

# Seed NumPy and PyTorch so that DataLoader shuffling, dropout and the freshly
# initialised classifier head start from the same state on every
# "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)  # seeds the CPU generator and all CUDA devices

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


### Load Data and Define Labels

In [2]:
# GoEmotions ("simplified" configuration) from the Hugging Face hub.
dataset = load_dataset(
    "google-research-datasets/go_emotions",
    name="simplified",
)

# Emotion names. Order matters: convert_to_df maps integer label id j to the
# column named LABELS[j].
LABELS = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]
NUM_LABELS = len(LABELS)

def convert_to_df(split):
    """Turn one split of the global `dataset` into a multi-hot DataFrame.

    Args:
        split: Key of the split inside `dataset` (e.g. 'train').

    Returns:
        pandas.DataFrame with a 'text' column followed by one 0/1 integer
        column per entry of LABELS; column LABELS[j] is 1 when id j appears
        in the example's 'labels' list.
    """
    data = dataset[split]

    # Read each column once. Row-wise access (`data[i]`) materialises a whole
    # example on every lookup, which is far slower than bulk column reads.
    texts = list(data['text'])
    label_lists = list(data['labels'])

    multi_hot = np.zeros((len(texts), NUM_LABELS), dtype=np.int64)
    for row, label_ids in enumerate(label_lists):
        # Ids outside [0, NUM_LABELS) never set a column.
        valid = [j for j in label_ids if 0 <= j < NUM_LABELS]
        if valid:
            multi_hot[row, valid] = 1

    frame = pd.DataFrame(multi_hot, columns=LABELS)
    frame.insert(0, 'text', texts)
    return frame

# Materialise the three dataset splits as multi-hot DataFrames.
train_df, val_df, test_df = (
    convert_to_df(split_name)
    for split_name in ('train', 'validation', 'test')
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Train: 43410, Val: 5426, Test: 5427


### Initialize DeBERTa Tokenizer and Dataset

In [3]:
# Hugging Face checkpoint id for DeBERTa v3 base. The same constant is passed
# to the model loader later, so tokenizer and model come from one checkpoint.
MODEL_NAME = "microsoft/deberta-v3-base"
# Tokenizer matching MODEL_NAME; shared by every Dataset built in this notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class EmotionDataset(Dataset):
    """Map-style dataset yielding tokenised text with multi-hot emotion targets.

    Args:
        dataframe: Frame with a 'text' column and one numeric column per
            entry of LABELS.
        tokenizer: Callable invoked as
            tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt'); its result must expose
            'input_ids' and 'attention_mask' tensors.
        max_length: Length every example is truncated / padded to.

    Each item is a dict with 1-D 'input_ids' and 'attention_mask' tensors and
    a float32 'labels' tensor holding one value per entry of LABELS.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = LABELS

        # Pull text and targets out of pandas once. A `.loc` lookup across all
        # label columns for every item is slow and is repeated every epoch;
        # these caches reflect `self.data` as it was at construction time.
        self._texts = [str(text) for text in self.data['text']]
        self._targets = self.data[self.labels].to_numpy(dtype='float32')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self._texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # Drop the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self._targets[idx], dtype=torch.float)
        }

BATCH_SIZE = 16

# One tokenising dataset per split, all using the class-default max_length.
train_dataset, val_dataset, test_dataset = (
    EmotionDataset(split_frame, tokenizer)
    for split_frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=BATCH_SIZE)
    for eval_dataset in (val_dataset, test_dataset)
)



### Initialize DeBERTa Model

In [4]:
EPOCHS = 4  # DeBERTa often benefits from slightly longer training
LEARNING_RATE = 2e-5

# Sequence classifier with one output per emotion, configured for multi-label
# targets, then moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=NUM_LABELS,
).to(device)

optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

# The scheduler is stepped once per batch, so the schedule length is the
# total number of optimisation steps across all epochs.
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=500,  # Slight warmup
    num_training_steps=total_steps,
)

print(f"Model {MODEL_NAME} loaded on {device}")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model microsoft/deberta-v3-base loaded on cuda


In [5]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over `dataloader`.

    Args:
        model: Module called with input_ids / attention_mask / labels keyword
            arguments whose output exposes a `.loss` tensor.
        dataloader: Sized iterable of dicts with 'input_ids',
            'attention_mask' and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses (sum of losses / number of batches).
    """
    model.train()
    running_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        batch_loss = outputs.loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, device, threshold=0.5):
    """Score `model` on `dataloader` without updating it.

    Args:
        model: Module called with input_ids / attention_mask / labels keyword
            arguments whose output exposes `.loss` and `.logits`.
        dataloader: Sized iterable of dicts with 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: Device the batch tensors are moved to.
        threshold: A label is predicted when sigmoid(logit) is strictly
            greater than this value.

    Returns:
        dict with keys 'loss' (mean per-batch loss), 'f1_micro', 'f1_macro',
        'predictions' and 'labels' (NumPy arrays stacked over all batches).
    """
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

            # Independent per-label probabilities, binarised at `threshold`.
            probs = torch.sigmoid(outputs.logits)
            preds = (probs > threshold).float()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # zero_division=0 scores labels with no positives or no predictions as 0
    # (the value sklearn falls back to anyway) without emitting a warning per
    # call, matching the classification_report calls in this notebook.
    f1_micro = f1_score(all_labels, all_preds, average='micro', zero_division=0)
    f1_macro = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return {
        'loss': total_loss / len(dataloader),
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'predictions': all_preds,
        'labels': all_labels
    }

### Train DeBERTa

In [6]:
# Start below any attainable score so the first epoch always writes a
# checkpoint. With 0 and the strict '>' test, a run whose macro-F1 stays at
# 0.0 would never save, and the later torch.load('best_deberta_model.pt')
# would fail or silently pick up a stale file from an earlier run.
best_f1 = float("-inf")

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print(f"Training Loss: {train_loss:.4f}")

    val_results = evaluate(model, val_loader, device)
    print(f"Validation Loss: {val_results['loss']:.4f}")
    print(f"Validation F1 (micro): {val_results['f1_micro']:.4f}")
    print(f"Validation F1 (macro): {val_results['f1_macro']:.4f}")

    # Model selection is on validation macro-F1; only improvements overwrite
    # the checkpoint on disk.
    if val_results['f1_macro'] > best_f1:
        best_f1 = val_results['f1_macro']
        torch.save(model.state_dict(), 'best_deberta_model.pt')
        print("Saved best model!")

print(f"\nBest Validation F1 (macro): {best_f1:.4f}")


Epoch 1/4


Training: 100%|██████████| 2714/2714 [05:37<00:00,  8.03it/s]


Training Loss: 0.1484


Evaluating: 100%|██████████| 340/340 [00:16<00:00, 21.13it/s]


Validation Loss: 0.0936
Validation F1 (micro): 0.5229
Validation F1 (macro): 0.2893
Saved best model!

Epoch 2/4


Training: 100%|██████████| 2714/2714 [05:20<00:00,  8.46it/s]


Training Loss: 0.0888


Evaluating: 100%|██████████| 340/340 [00:13<00:00, 25.60it/s]


Validation Loss: 0.0852
Validation F1 (micro): 0.5766
Validation F1 (macro): 0.3938
Saved best model!

Epoch 3/4


Training: 100%|██████████| 2714/2714 [05:22<00:00,  8.42it/s]


Training Loss: 0.0783


Evaluating: 100%|██████████| 340/340 [00:15<00:00, 22.19it/s]


Validation Loss: 0.0858
Validation F1 (micro): 0.5755
Validation F1 (macro): 0.4336
Saved best model!

Epoch 4/4


Training: 100%|██████████| 2714/2714 [06:01<00:00,  7.51it/s]


Training Loss: 0.0714


Evaluating: 100%|██████████| 340/340 [00:16<00:00, 20.22it/s]


Validation Loss: 0.0850
Validation F1 (micro): 0.5880
Validation F1 (macro): 0.4568
Saved best model!

Best Validation F1 (macro): 0.4568


### Evaluate on Test Set

In [7]:
# Restore the checkpoint with the best validation macro-F1. map_location
# places the tensors on the current device, so the load also works when the
# checkpoint was written on a GPU and the kernel is now running on CPU.
model.load_state_dict(torch.load('best_deberta_model.pt', map_location=device))
test_results = evaluate(model, test_loader, device)

print("="*50)
print("DEBERTA TEST RESULTS")
print("="*50)
print(f"Test F1 (micro): {test_results['f1_micro']:.4f}")
print(f"Test F1 (macro): {test_results['f1_macro']:.4f}")

# Per-emotion precision / recall / F1; zero_division=0 reports labels that
# were never predicted as 0 instead of warning.
print("\nDetailed Classification Report:")
print(classification_report(
    test_results['labels'], 
    test_results['predictions'], 
    target_names=LABELS,
    zero_division=0
))

Evaluating: 100%|██████████| 340/340 [00:15<00:00, 21.37it/s]


DEBERTA TEST RESULTS
Test F1 (micro): 0.5908
Test F1 (macro): 0.4559

Detailed Classification Report:
                precision    recall  f1-score   support

    admiration       0.70      0.72      0.71       504
     amusement       0.77      0.90      0.83       264
         anger       0.54      0.46      0.50       198
     annoyance       0.56      0.20      0.29       320
      approval       0.56      0.36      0.44       351
        caring       0.53      0.39      0.44       135
     confusion       0.54      0.37      0.44       153
     curiosity       0.50      0.54      0.52       284
        desire       0.72      0.37      0.49        83
disappointment       0.53      0.12      0.19       151
   disapproval       0.46      0.37      0.41       267
       disgust       0.60      0.40      0.48       123
 embarrassment       0.71      0.32      0.44        37
    excitement       0.50      0.34      0.40       103
          fear       0.60      0.72      0.65        78
 

## Domain Adaptation: Presidential Speeches
Fine-tune the best GoEmotions-trained DeBERTa checkpoint on the GoEmotions-labeled presidential speeches dataset, holding out 20% of the speeches for evaluation.

In [9]:
#import train test split
from sklearn.model_selection import train_test_split

In [10]:
# Read the GoEmotions-labelled presidential speeches.
pres_df = pd.read_csv("data/presidential_speeches_goemotions_labeled.csv")
print(f"Presidential Dataset Shape: {pres_df.shape}")

# The first candidate name present in the frame wins; None if nothing matches.
text_col = next(
    (
        candidate
        for candidate in ['speech', 'Speech', 'transcript', 'Transcript', 'text', 'Text', 'content']
        if candidate in pres_df.columns
    ),
    None,
)

if text_col is None:
    raise ValueError(f"Could not find text column. Available columns: {pres_df.columns.tolist()}")

print(f"Using text column: {text_col}")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable.
pres_train_df, pres_test_df = train_test_split(pres_df, random_state=42, test_size=0.2)

# Define Dataset class for Presidential data
class PresidentialDataset(Dataset):
    """Map-style dataset over the presidential speeches frame.

    Args:
        dataframe: Frame holding the speech text plus one numeric column per
            entry of LABELS.
        tokenizer: Callable invoked as
            tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt'); its result must expose
            'input_ids' and 'attention_mask' tensors.
        text_col: Name of the column that contains the speech text.
        max_length: Length every example is truncated / padded to.

    Each item is a dict with 1-D 'input_ids' and 'attention_mask' tensors and
    a float32 'labels' tensor holding one value per entry of LABELS.
    """

    def __init__(self, dataframe, tokenizer, text_col, max_length=256):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_col = text_col
        self.labels = LABELS

        # Missing speeches become "". The NaN check has to run on the raw
        # value: once wrapped in str() a NaN is the literal text "nan" and
        # pd.isna() can no longer detect it.
        self._texts = [
            "" if pd.isna(raw) else str(raw)
            for raw in self.data[self.text_col]
        ]
        # Convert the targets once instead of slicing pandas for every item;
        # the caches reflect `self.data` as it was at construction time.
        self._targets = self.data[self.labels].to_numpy(dtype='float32')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self._texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # Drop the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self._targets[idx], dtype=torch.float)
        }

# Wrap both halves of the split in tokenising datasets (class-default max_length).
pres_train_dataset, pres_test_dataset = (
    PresidentialDataset(split_frame, tokenizer, text_col)
    for split_frame in (pres_train_df, pres_test_df)
)

# Shuffle only the training batches; the held-out loader keeps a fixed order.
pres_train_loader = DataLoader(pres_train_dataset, shuffle=True, batch_size=BATCH_SIZE)
pres_test_loader = DataLoader(pres_test_dataset, batch_size=BATCH_SIZE)

Presidential Dataset Shape: (995, 67)
Using text column: speech


In [11]:
# Start from the best GoEmotions checkpoint. map_location places the tensors
# on the current device, so the load also works when the checkpoint was
# written on a GPU and the kernel is now running on CPU.
model.load_state_dict(torch.load('best_deberta_model.pt', map_location=device))
model.to(device)

# Fine-tuning parameters
FINE_TUNE_LR = 1e-5
FINE_TUNE_EPOCHS = 3

# Fresh optimizer and schedule for the fine-tuning run: linear decay with no
# warmup over all fine-tuning steps.
optimizer = AdamW(model.parameters(), lr=FINE_TUNE_LR, weight_decay=0.01)
total_steps = len(pres_train_loader) * FINE_TUNE_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

print(f"Fine-tuning DeBERTa on presidential data for {FINE_TUNE_EPOCHS} epochs...")

# Start below any attainable score so the first epoch always writes a
# checkpoint. With 0 and the strict '>' test, a run whose macro-F1 stays at
# 0.0 would never save 'best_presidential_deberta_model.pt', and the cell that
# loads it would fail or pick up a stale file.
best_pres_f1 = float("-inf")

# NOTE(review): the checkpoint is selected on pres_test_loader, the same
# held-out rows that are later reported as the final test score, so that
# score is optimistically biased. Consider carving a validation split out of
# the training rows and selecting on that instead.
for epoch in range(FINE_TUNE_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{FINE_TUNE_EPOCHS}")

    train_loss = train_epoch(model, pres_train_loader, optimizer, scheduler, device)
    print(f"Training Loss: {train_loss:.4f}")

    val_results = evaluate(model, pres_test_loader, device)
    print(f"Test Loss: {val_results['loss']:.4f}")
    print(f"Test F1 (micro): {val_results['f1_micro']:.4f}")
    print(f"Test F1 (macro): {val_results['f1_macro']:.4f}")

    if val_results['f1_macro'] > best_pres_f1:
        best_pres_f1 = val_results['f1_macro']
        torch.save(model.state_dict(), 'best_presidential_deberta_model.pt')
        print("Saved best presidential model!")

print(f"\nBest Presidential Test F1 (macro): {best_pres_f1:.4f}")

Fine-tuning DeBERTa on presidential data for 3 epochs...

Epoch 1/3


Training: 100%|██████████| 50/50 [00:14<00:00,  3.35it/s]


Training Loss: 0.0594


Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.84it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Test Loss: 0.0527
Test F1 (micro): 0.7169
Test F1 (macro): 0.0404
Saved best presidential model!

Epoch 2/3


Training: 100%|██████████| 50/50 [00:13<00:00,  3.83it/s]


Training Loss: 0.0527


Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.61it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Test Loss: 0.0528
Test F1 (micro): 0.6883
Test F1 (macro): 0.0394

Epoch 3/3


Training: 100%|██████████| 50/50 [00:14<00:00,  3.37it/s]


Training Loss: 0.0497


Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.44it/s]

Test Loss: 0.0524
Test F1 (micro): 0.6868
Test F1 (macro): 0.0394

Best Presidential Test F1 (macro): 0.0404



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [12]:
# Restore the fine-tuned checkpoint with the best macro-F1. map_location
# places the tensors on the current device, so the load also works when the
# checkpoint was written on a GPU and the kernel is now running on CPU.
model.load_state_dict(torch.load('best_presidential_deberta_model.pt', map_location=device))

final_results = evaluate(model, pres_test_loader, device)

print("="*60)
print("FINAL PRESIDENTIAL DEBERTA MODEL RESULTS")
print("="*60)
print(f"Test Loss: {final_results['loss']:.4f}")
print(f"Test F1 (micro): {final_results['f1_micro']:.4f}")
print(f"Test F1 (macro): {final_results['f1_macro']:.4f}")

# Per-emotion precision / recall / F1; zero_division=0 reports labels with no
# support or no predictions as 0 instead of warning.
print("\nDetailed Classification Report:")
print(classification_report(
    final_results['labels'], 
    final_results['predictions'], 
    target_names=LABELS,
    zero_division=0
))

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.50it/s]

FINAL PRESIDENTIAL DEBERTA MODEL RESULTS
Test Loss: 0.0527
Test F1 (micro): 0.7169
Test F1 (macro): 0.0404

Detailed Classification Report:
                precision    recall  f1-score   support

    admiration       0.00      0.00      0.00         3
     amusement       0.00      0.00      0.00         0
         anger       0.00      0.00      0.00         0
     annoyance       0.00      0.00      0.00         0
      approval       0.00      0.00      0.00        36
        caring       0.00      0.00      0.00         1
     confusion       0.00      0.00      0.00         0
     curiosity       0.00      0.00      0.00         1
        desire       0.00      0.00      0.00         2
disappointment       0.00      0.00      0.00         0
   disapproval       0.00      0.00      0.00         0
       disgust       0.00      0.00      0.00         0
 embarrassment       0.00      0.00      0.00         0
    excitement       0.00      0.00      0.00         0
          fear     


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
