In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from torch.optim import AdamW

# Load and prepare your new dataset
df = pd.read_csv("C:/Users/jack/Downloads/cropped_df.csv")

# Drop rows where 'text' or 'main_category' is NaN
df = df.dropna(subset=['text', 'main_category'])

# Encode 'main_category' (target variable)
label_encoder = LabelEncoder()
df['main_category'] = label_encoder.fit_transform(df['main_category'])

# Use the 'text' column for input text and 'main_category' for the target
texts = df['text'].tolist()
labels = df['main_category'].tolist()

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenizer and Dataset preparation
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        label = self.labels[item]
        
        # Handle potential NaN text values by replacing them with an empty string
        if pd.isna(text):
            text = ""

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Prepare datasets and dataloaders
train_dataset = EmotionDataset(X_train, y_train, tokenizer)
val_dataset = EmotionDataset(X_val, y_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Model Initialization
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

# Optimizer and device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
epochs = 10

for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            _, preds = torch.max(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Evaluation metrics
    print(f'Epoch {epoch+1}/{epochs}')
    print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))
    print('Confusion Matrix:')
    print(confusion_matrix(all_labels, all_preds))

# After training, save the model if needed
model.save_pretrained("emotion_classifier_model")
tokenizer.save_pretrained("emotion_classifier_tokenizer")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
              precision    recall  f1-score   support

       anger       0.85      0.87      0.86      1346
     disgust       0.98      1.00      0.99      1287
        fear       0.89      0.79      0.84      1304
   happiness       0.92      0.84      0.88      1342
     neutral       0.77      0.95      0.85      1325
     sadness       0.91      0.81      0.86      1332
    surprise       0.79      0.81      0.80      1339

    accuracy                           0.87      9275
   macro avg       0.87      0.87      0.87      9275
weighted avg       0.87      0.87      0.87      9275

Confusion Matrix:
[[1170    7   47   35   27   41   19]
 [   0 1281    0    0    6    0    0]
 [  57    3 1026    7   66   17  128]
 [  44    0   17 1133   57   37   54]
 [   2   11    5    0 1263    2   42]
 [  88    5   45   45   28 1082   39]
 [  13    2   13   14  197   13 1087]]
Epoch 2/10
              precision    recall  f1-score   support

       anger       0.93      0.87      0.

KeyboardInterrupt: 