In [7]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from torch.optim import AdamW

# Load and prepare your dataset
df = pd.read_excel('C:/Users/jack/Downloads/FINAL_DATASET (2).xlsx')

# Drop rows where 'Translation' or 'Corrected_Emotion' is NaN
df = df.dropna(subset=['Translation', 'Corrected_Emotion'])

# Encode 'Corrected_Emotion' (target variable)
label_encoder = LabelEncoder()
df['Corrected_Emotion'] = label_encoder.fit_transform(df['Corrected_Emotion'])

# Use the 'Translation' column for input text and 'Corrected_Emotion' for the target
texts = df['Translation'].tolist()  # Use Translation column
labels = df['Corrected_Emotion'].tolist()  # Use Corrected_Emotion as the target

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenizer and Dataset preparation
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        label = self.labels[item]
        
        # Handle potential NaN text values by replacing them with an empty string
        if pd.isna(text):
            text = ""  # Or some placeholder text if preferred

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Prepare datasets and dataloaders
train_dataset = EmotionDataset(X_train, y_train, tokenizer)
val_dataset = EmotionDataset(X_val, y_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Model Initialization
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

# Optimizer and device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
epochs = 3

for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            _, preds = torch.max(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Evaluation metrics
    print(f'Epoch {epoch+1}/{epochs}')
    print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))
    print('Confusion Matrix:')
    print(confusion_matrix(all_labels, all_preds))

# After training, save the model if needed
model.save_pretrained("emotion_classifier_model")
tokenizer.save_pretrained("emotion_classifier_tokenizer")



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        20
     disgust       0.00      0.00      0.00         5
        fear       0.00      0.00      0.00        28
   happiness       0.71      0.45      0.55       122
     neutral       0.73      0.97      0.83       499
     sadness       0.00      0.00      0.00        27
    surprise       0.52      0.25      0.33        69

    accuracy                           0.72       770
   macro avg       0.28      0.24      0.25       770
weighted avg       0.63      0.72      0.66       770

Confusion Matrix:
[[  0   0   0   1  13   0   6]
 [  0   0   0   0   3   0   2]
 [  0   0   0   1  25   0   2]
 [  0   0   0  55  66   0   1]
 [  0   0   0  13 483   0   3]
 [  0   0   0   0  25   0   2]
 [  0   0   0   7  44   1  17]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/3
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        20
     disgust       0.00      0.00      0.00         5
        fear       0.00      0.00      0.00        28
   happiness       0.74      0.60      0.66       122
     neutral       0.77      0.93      0.85       499
     sadness       0.23      0.30      0.26        27
    surprise       0.57      0.29      0.38        69

    accuracy                           0.74       770
   macro avg       0.33      0.30      0.31       770
weighted avg       0.68      0.74      0.70       770

Confusion Matrix:
[[  0   0   0   1  10   4   5]
 [  0   0   0   0   2   2   1]
 [  0   0   0   1  16   8   3]
 [  0   0   0  73  46   2   1]
 [  0   0   0  21 466   8   4]
 [  0   0   0   0  18   8   1]
 [  0   0   0   2  44   3  20]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/3
              precision    recall  f1-score   support

       anger       1.00      0.05      0.10        20
     disgust       0.00      0.00      0.00         5
        fear       0.60      0.11      0.18        28
   happiness       0.67      0.66      0.67       122
     neutral       0.79      0.91      0.84       499
     sadness       0.34      0.37      0.36        27
    surprise       0.61      0.33      0.43        69

    accuracy                           0.74       770
   macro avg       0.57      0.35      0.37       770
weighted avg       0.73      0.74      0.71       770

Confusion Matrix:
[[  1   0   0   2  10   1   6]
 [  0   0   0   0   2   2   1]
 [  0   0   3   1  17   5   2]
 [  0   0   1  81  39   1   0]
 [  0   0   1  34 454   6   4]
 [  0   0   0   0  15  10   2]
 [  0   0   0   3  39   4  23]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('emotion_classifier_tokenizer\\tokenizer_config.json',
 'emotion_classifier_tokenizer\\special_tokens_map.json',
 'emotion_classifier_tokenizer\\vocab.txt',
 'emotion_classifier_tokenizer\\added_tokens.json')