In [252]:
import os
import time
from collections import Counter

import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torchmetrics.classification import (
    MulticlassAccuracy,
    MulticlassPrecision,
    MulticlassF1Score,
    MulticlassRecall,
)

In [253]:
# Parameters
BATCH_SIZE = 128
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
EPOCHS = 100

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [254]:
def load_data(file_path):
    df = pd.read_csv(file_path, delimiter=";", header=None, names=["text", "emotion"])
    return df["text"].values, df["emotion"].values


# Tokenize and encode labels
def tokenize(text, word_to_idx):
    return [word_to_idx.get(word, word_to_idx["<UNK>"]) for word in text.split()]


def encode_labels(labels):
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(labels), label_encoder

In [255]:
train_texts, train_labels = load_data("data/train.txt")
test_texts, test_labels = load_data("data/test.txt")

# Build vocabulary
word_counter = Counter()
for text in train_texts:
    word_counter.update(text.split())
vocab = ["<PAD>", "<UNK>"] + [word for word, freq in word_counter.items() if freq > 1]
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)


train_labels, label_encoder = encode_labels(train_labels)
test_labels, _ = encode_labels(test_labels)

In [256]:
# Dataset
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, word_to_idx, device=None):
        """
        Args:
            texts (list of list of str): List of tokenized sentences.
            labels (list of int): List of corresponding labels.
            word_to_idx (dict): Dictionary mapping words to indices.
            device (torch.device, optional): Device to store the tensors on (e.g., 'cuda' or 'cpu').
        """
        self.device = device if device is not None else torch.device("cpu")

        # Preprocess texts and labels into tensors directly
        # token_texts = []
        # for text in texts:
        #     tokens = tokenize(text, word_to_idx)
        #     token_texts.append(torch.tensor(tokens, dtype=torch.float32, device=self.device))
        # self.texts = token_texts

        self.texts = [
            torch.tensor(tokenize(text, word_to_idx), device=device) for text in texts
        ]
        self.labels = torch.tensor(labels, dtype=torch.long, device=self.device)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Return preprocessed tensors
        return self.texts[idx], self.labels[idx]

In [257]:

# LSTM Model
class LSTMEmotionClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(LSTMEmotionClassifier, self).__init__()
        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim,
            # , padding_idx=word_to_idx["<PAD>"]
        )
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        x = lstm_out[:, -1, :]
        return self.fc(x)


In [258]:
# Padding function
def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=word_to_idx["<PAD>"])
    return texts.to(device), torch.tensor(labels).to(device)

In [259]:
def calc_metrics(predictions, labels, num_classes: int):
    accuracy_metric = MulticlassAccuracy(num_classes=num_classes, average="micro")
    precision_metric = MulticlassPrecision(num_classes=num_classes, average="macro")
    recall_metric = MulticlassRecall(num_classes=num_classes, average="macro")
    f1_metric= MulticlassF1Score(num_classes=num_classes, average="macro")

    predictions = torch.tensor(predictions)
    labels = torch.tensor(labels)

    accuracy = accuracy_metric(predictions, labels)
    precision = precision_metric(predictions, labels)
    recall = recall_metric(predictions, labels)
    f1 = f1_metric(predictions, labels)

    return accuracy, precision, recall, f1

In [260]:
# Evaluate the model
def evaluate_model(model: nn.Module, data_loader: DataLoader, num_classes: int):
    model.eval()

    predicted_acum = []
    labels_acum = []
    with torch.no_grad():
        for texts, labels in data_loader:
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            predicted_acum.extend(predicted.tolist())
            labels_acum.extend(labels.tolist())

    accuracy, precision, recall, f1 = calc_metrics(
        predicted_acum, labels_acum, num_classes
    )

    print(f"Accuracy: {accuracy*100:.2f}%")
    print(f"Precision: {precision*100:.2f}%")
    print(f"Recall: {recall*100:.2f}%")
    print(f"F1 score: {f1*100:.2f}%")

In [261]:


# Data Loaders
train_dataset = EmotionDataset(train_texts, train_labels, word_to_idx, device)
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)

test_dataset = EmotionDataset(test_texts, test_labels, word_to_idx, device)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)

In [None]:
def train():
    # Model initialization
    model = LSTMEmotionClassifier(
        vocab_size, EMBEDDING_DIM, HIDDEN_DIM, len(label_encoder.classes_)
    )
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    for epoch in range(EPOCHS):
        start = time.time()
        for texts, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        end = time.time()
        epoch_time = end - start
        # print(f"Epoch {epoch+1}, Loss: {loss.item()}, Time: {epoch_time:.2f}s")
    
    return model
    # Save the model to disk
    # torch.save(model.state_dict(), "lstm_emotion_classifier_model.pth")
    # evaluate_model(model, test_loader, len(label_encoder.classes_))

In [263]:
times = []
for i in range(1):
    start = time.time()
    model = train()
    end = time.time()
    times.append(end-start)

torch.save(model.state_dict(), "lstm_emotion_classifier_model.pth")
print(f"Average training time: {sum(times)/len(times):.2f}s")

Epoch 1, Loss: 1.5801793336868286, Time: 0.64s
Epoch 2, Loss: 1.5696349143981934, Time: 0.59s
Epoch 3, Loss: 1.5548969507217407, Time: 0.57s
Epoch 4, Loss: 1.5812246799468994, Time: 0.55s
Epoch 5, Loss: 1.6175702810287476, Time: 0.55s
Epoch 6, Loss: 1.6254268884658813, Time: 0.54s
Epoch 7, Loss: 1.550646185874939, Time: 0.53s
Epoch 8, Loss: 1.3567177057266235, Time: 0.54s
Epoch 9, Loss: 0.9782931208610535, Time: 0.56s
Epoch 10, Loss: 0.6881482601165771, Time: 0.53s
Epoch 11, Loss: 0.5787520408630371, Time: 0.54s
Epoch 12, Loss: 0.45572108030319214, Time: 0.57s
Epoch 13, Loss: 0.27451714873313904, Time: 0.68s
Epoch 14, Loss: 0.2064484804868698, Time: 0.68s
Epoch 15, Loss: 0.21109049022197723, Time: 0.69s
Epoch 16, Loss: 0.1710818111896515, Time: 0.58s
Epoch 17, Loss: 0.11758346855640411, Time: 0.58s
Epoch 18, Loss: 0.07628409564495087, Time: 0.58s
Epoch 19, Loss: 0.055165719240903854, Time: 0.61s
Epoch 20, Loss: 0.05386904627084732, Time: 0.57s
Epoch 21, Loss: 0.06495389342308044, Time:

In [264]:
def validate():
    # Validate the loaded model
    loaded_model = LSTMEmotionClassifier(
        vocab_size, EMBEDDING_DIM, HIDDEN_DIM, len(label_encoder.classes_)
    )
    loaded_model.load_state_dict(torch.load("lstm_emotion_classifier_model.pth"))
    loaded_model.to(device)
    # Load and process validation data
    val_texts, val_labels = load_data(os.path.join(os.getcwd(), "data/val.txt"))

    val_labels = label_encoder.transform(val_labels)
    print(val_labels)
    val_dataset = EmotionDataset(val_texts, val_labels, word_to_idx, device)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    # Evaluate the loaded model on validation data
    evaluate_model(loaded_model, val_loader, len(label_encoder.classes_))

In [265]:
validate()

[4 4 3 ... 2 2 2]
Accuracy: 90.75%
Precision: 87.51%
Recall: 86.23%
F1 score: 86.79%


In [270]:
def predict():
    loaded_model = LSTMEmotionClassifier(
        vocab_size, EMBEDDING_DIM, HIDDEN_DIM, len(label_encoder.classes_)
    )
    loaded_model.load_state_dict(torch.load("lstm_emotion_classifier_model.pth"))
    loaded_model.to(device)
    # Load and process validation data
    val_texts, val_labels = load_data(os.path.join(os.getcwd(), "data/val.txt"))

    val_labels = label_encoder.transform(val_labels)
    val_dataset = EmotionDataset(val_texts, val_labels, word_to_idx, device)
    val_loader = DataLoader(
        val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn
    )

    start = time.time()
    with torch.no_grad():
        for texts, labels in val_loader:
            outputs = loaded_model(texts)
            _, predicted_labels = torch.max(outputs.data, 1)

    end = time.time()

    return end - start

In [271]:
prediction_times = []
for i in range(100):
    prediction_times.append(predict())

print(f"Average validation time: {sum(prediction_times)/len(prediction_times):.5f}s")

Average validation time: 0.04673s
