In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report
import copy

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Load dataset

In [3]:
train_df = pd.read_csv('../data/train.txt', header=None, sep=";")
test_df = pd.read_csv('../data/test.txt', header=None, sep=";")
val_df = pd.read_csv('../data/validation.txt', header=None, sep=";")

In [4]:
train_df.head()

Unnamed: 0,0,1
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
train_df[1].value_counts()

1
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

We need to make a encoder for emotions

In [6]:
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df[1])
test_labels = label_encoder.transform(test_df[1])
val_labels = label_encoder.transform(val_df[1])

In [7]:
print(label_encoder.classes_)

['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


In [8]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")
model.eval()

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_texts = train_df[0].tolist()
test_texts = test_df[0].tolist()
val_texts = val_df[0].tolist()

def get_embeddings(texts):
    batch_size = 32
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        encodings = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        encodings = {k: v.to(device) for k, v in encodings.items()}

        with torch.no_grad():
            outputs = model(**encodings)
            cls_embeddings = outputs.last_hidden_state[:, 0]  # CLS token

        all_embeddings.append(cls_embeddings.cpu())

    embeds = torch.cat(all_embeddings)
    return embeds

X_train_tensor = get_embeddings(train_texts)
X_test_tensor = get_embeddings(test_texts)
X_val_tensor = get_embeddings(val_texts)

2025-07-17 14:13:47.790070: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-17 14:13:47.872659: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752754427.920347   30796 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752754427.927959   30796 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752754427.965523   30796 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

### Dataset & DataLoader

In [9]:
y_train_tensor = torch.tensor(train_labels, dtype=torch.long)
y_test_tensor = torch.tensor(test_labels, dtype=torch.long)
y_val_tensor = torch.tensor(val_labels, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)

### Model

In [10]:
class LSTMModel(nn.Module):
    def __init__(self, input_size=768, num_classes=6):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=256, num_layers=6, batch_first=True)
        self.fc1 = nn.Linear(in_features=256, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        hn = self.relu(hn)
        x = self.fc1(hn[-1])
        x = self.relu(x)
        x = self.dropout(x)
        out = self.fc2(x)
        return out

In [11]:
def train(model, train_loader, optimizer, criterion, device, epochs=10, val_loader=None, patience=5):
    model.to(device)
    best_model_wts = copy.deepcopy(model.state_dict())

    best_val_loss = float('inf')
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch.unsqueeze(1))
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

        train_acc = correct / total
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f}", end="")

        if val_loader:
            val_loss, val_acc = evaluate(model, val_loader, criterion, device, return_metrics=True)
            print(f" | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

            # Early stopping logic
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_no_improve = 0
                best_model_wts = model.state_dict()
            else:
                epochs_no_improve += 1

            if epochs_no_improve >= patience:
                model.load_state_dict(best_model_wts)
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(best_model_wts, "../models/best_lstm_model.pth")
                break
        else:
            print()

In [12]:
def evaluate(model, data_loader, criterion, device, label_encoder=None, return_metrics=False):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            outputs = model(X_batch.unsqueeze(1))
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

    accuracy = correct / total

    if return_metrics:
        avg_loss = total_loss / len(data_loader)
        return avg_loss, accuracy
    else:
        print("\nClassification Report:")
        print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_ if label_encoder else None))

In [13]:
criterion = torch.nn.CrossEntropyLoss()
model = LSTMModel()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

### Train

In [14]:
train(model, train_loader, optimizer, criterion, device, epochs=50, val_loader=test_loader, patience=5)

Epoch 1/50 | Train Loss: 795.9105 | Train Acc: 0.3231 | Val Loss: 1.5676 | Val Acc: 0.3475
Epoch 2/50 | Train Loss: 791.0013 | Train Acc: 0.3321 | Val Loss: 1.5626 | Val Acc: 0.3475
Epoch 3/50 | Train Loss: 789.7311 | Train Acc: 0.3339 | Val Loss: 1.5675 | Val Acc: 0.3475
Epoch 4/50 | Train Loss: 790.2893 | Train Acc: 0.3320 | Val Loss: 1.5590 | Val Acc: 0.3475
Epoch 5/50 | Train Loss: 708.2081 | Train Acc: 0.4613 | Val Loss: 1.3207 | Val Acc: 0.5025
Epoch 6/50 | Train Loss: 650.2456 | Train Acc: 0.5136 | Val Loss: 1.2415 | Val Acc: 0.5300
Epoch 7/50 | Train Loss: 625.8571 | Train Acc: 0.5319 | Val Loss: 1.2098 | Val Acc: 0.5460
Epoch 8/50 | Train Loss: 606.8732 | Train Acc: 0.5492 | Val Loss: 1.1683 | Val Acc: 0.5725
Epoch 9/50 | Train Loss: 593.7319 | Train Acc: 0.5579 | Val Loss: 1.1966 | Val Acc: 0.5595
Epoch 10/50 | Train Loss: 576.9561 | Train Acc: 0.5681 | Val Loss: 1.1479 | Val Acc: 0.5660
Epoch 11/50 | Train Loss: 565.9292 | Train Acc: 0.5776 | Val Loss: 1.1281 | Val Acc: 0.57

In [15]:
model = LSTMModel()
model.load_state_dict(torch.load("../models/best_lstm_model.pth"))
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
evaluate(model, val_loader, criterion, label_encoder=label_encoder, device=device, return_metrics=False)


Classification Report:
              precision    recall  f1-score   support

       anger       0.51      0.32      0.39       275
        fear       0.49      0.61      0.55       212
         joy       0.66      0.83      0.74       704
        love       0.38      0.19      0.25       178
     sadness       0.63      0.67      0.65       550
    surprise       0.54      0.09      0.15        81

    accuracy                           0.60      2000
   macro avg       0.53      0.45      0.45      2000
weighted avg       0.58      0.60      0.58      2000

