In [8]:
import re
from collections import Counter
import time
from collections import defaultdict
from torchmetrics.classification import (
    MulticlassAccuracy,
    MulticlassPrecision,
    MulticlassF1Score,
    MulticlassRecall,
)
import torch

In [9]:
# Class names. The position of each name in this list is used as its integer
# class index when metrics are computed, so the order is significant.
emotions = [
    "love",
    "fear",
    "sadness",
    "surprise",
    "joy",
    "anger",
]

# Words dropped while building the lexicon.
# NOTE(review): create_lexicon tokenises with r"\w+", which never yields an
# apostrophe, so entries such as "you're" or "im'" cannot match a token; they
# are kept here only so the set stays identical to the original.
stop_words = {
    # first / second person pronouns
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "you're", "you've", "you'll", "you'd",
    "your", "yours", "yourself", "yourselves",
    # third person pronouns
    "he", "him", "his", "himself",
    "she", "she's", "her", "hers", "herself",
    "it", "it's", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom",
    "this", "that", "that'll", "these", "those",
    # auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having",
    "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or",
    "because", "as", "until", "while",
    # prepositions
    "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs of time / place / manner
    "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how",
    # quantifiers and degree words
    "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very",
    # misc. short function words
    "s", "t", "can", "will", "just", "don", "don't",
    "should", "should've", "now",
    # contraction fragments
    "d", "ll", "m", "o", "re", "ve", "y",
    # negated contractions and their stems
    "ain", "aren", "aren't", "couldn", "couldn't", "didn", "didn't",
    "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't",
    "haven", "haven't", "isn", "isn't", "ma",
    "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't",
    "shan", "shan't", "shouldn", "shouldn't",
    "wasn", "wasn't", "weren", "weren't",
    "won", "won't", "wouldn", "wouldn't",
    # additional high-frequency words added on top of the generic list above
    # NOTE(review): plain "could" is absent while "could'" is present, and
    # "stil" looks like a misspelling of "still" — confirm whether intended.
    "feeling", "feel", "really", "im", "like", "know", "get", "ive", "im'",
    "stil", "even", "time", "want", "one", "cant", "think", "go", "much",
    "never", "day", "back", "see", "still", "make", "thing",
    "would", "would'", "could'", "little",
}



In [10]:


def create_lexicon(file_path, counter_most_common):
    """Build a per-emotion list of the most frequent non-stop-words.

    The file is read as UTF-8, one record per line, in the form
    ``text;emotion``. The label is whatever follows the *last* semicolon, so
    the text itself may contain semicolons. Blank lines are skipped, as are
    records whose label is not one of the module-level ``emotions``.

    Args:
        file_path: Path of the labelled training file.
        counter_most_common: Number of top words to keep per emotion; passed
            straight to ``Counter.most_common`` (``None`` keeps every word).

    Returns:
        A dict mapping every name in ``emotions`` to a list of words ordered
        from most to least frequent. Emotions with no records map to ``[]``.

    Raises:
        ValueError: If a non-blank line contains no ``;`` separator.
    """
    emotion_counters = {emotion: Counter() for emotion in emotions}

    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no data and must not abort the whole run.
                continue

            # Split on the last ';' only, so punctuation inside the text does
            # not break the two-field unpacking.
            text, separator, emotion = record.rpartition(";")
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'text;emotion', got {record!r}"
                )

            emotion = emotion.strip()
            if emotion not in emotion_counters:
                continue

            emotion_counters[emotion].update(
                word
                for word in re.findall(r"\w+", text.lower())
                if word not in stop_words
            )

    return {
        emotion: [word for word, _ in counter.most_common(counter_most_common)]
        for emotion, counter in emotion_counters.items()
    }



In [11]:
def predict(file_path, lexicon):
    """Classify each labelled line of a file by lexicon word overlap.

    The file is read as UTF-8, one record per line, in the form
    ``text;emotion``. The label is whatever follows the *last* semicolon, so
    the text itself may contain semicolons; blank lines are skipped. For each
    record the text is lower-cased, split into runs of word characters, and
    scored against every emotion as the number of distinct tokens that also
    appear in that emotion's word list.

    The predicted emotion is the one with the highest score. Ties, including
    the case where nothing matches at all, go to the emotion that comes first
    in ``lexicon``'s iteration order.

    Args:
        file_path: Path of the labelled file to classify.
        lexicon: Mapping of emotion name to an iterable of words. Words are
            compared case-insensitively.

    Returns:
        A ``(predictions, labels)`` pair of equally long lists holding the
        predicted and the recorded emotion name for every record, in file
        order.

    Raises:
        ValueError: If a non-blank line contains no ``;`` separator, or if
            ``lexicon`` is empty while the file contains at least one record.
    """
    predictions = []
    labels = []

    # Normalise once up front so the per-line work is a plain set intersection.
    lexicon_sets = {
        emotion: set(map(str.lower, words)) for emotion, words in lexicon.items()
    }

    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no data and must not abort the whole run.
                continue

            # Split on the last ';' only, so punctuation inside the text does
            # not break the two-field unpacking.
            text, separator, actual_emotion = record.rpartition(";")
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'text;emotion', got {record!r}"
                )

            words = set(re.findall(r"\w+", text.lower()))
            emotion_scores = {
                emotion: len(words & emotion_words)
                for emotion, emotion_words in lexicon_sets.items()
            }

            # max() returns the first maximal key, which yields the documented
            # tie-breaking by lexicon order.
            predictions.append(max(emotion_scores, key=emotion_scores.get))
            labels.append(actual_emotion.strip())

    return predictions, labels

In [12]:
def evaluate_results(predictions, labels, num_classes: int):
    """Score predicted emotion names against the recorded ones.

    Both sequences of names are converted to integer class ids using each
    name's position in the module-level ``emotions`` list, turned into
    tensors, and passed to four torchmetrics scorers.

    Args:
        predictions: Predicted emotion names.
        labels: Recorded emotion names, aligned with ``predictions``.
        num_classes: Number of classes handed to every metric.

    Returns:
        ``(accuracy, precision, recall, f1)`` as returned by the metric
        objects. Accuracy is micro-averaged; the other three are
        macro-averaged.

    Raises:
        ValueError: If a name is not present in ``emotions``.
    """
    scorers = (
        MulticlassAccuracy(num_classes=num_classes, average="micro"),
        MulticlassPrecision(num_classes=num_classes, average="macro"),
        MulticlassRecall(num_classes=num_classes, average="macro"),
        MulticlassF1Score(num_classes=num_classes, average="macro"),
    )

    # Class id == position of the name in `emotions`.
    predicted_ids = torch.tensor([emotions.index(name) for name in predictions])
    target_ids = torch.tensor([emotions.index(name) for name in labels])

    accuracy, precision, recall, f1 = (
        scorer(predicted_ids, target_ids) for scorer in scorers
    )
    return accuracy, precision, recall, f1

In [13]:
def run(
    train_file_path="data/train.txt",
    test_file_path="data/test.txt",
    counter_most_common=350,
):
    """Build the lexicon, classify the test file, and print the results.

    Calling ``run()`` with no arguments uses the paths and lexicon size this
    notebook was originally hard-wired to. The parameters make it possible to
    sweep the lexicon size or point at other data without editing the
    function.

    Args:
        train_file_path: Labelled file used to build the lexicon.
        test_file_path: Labelled file to classify and score.
        counter_most_common: Number of most frequent words kept per emotion.

    Returns:
        None. Accuracy, precision, recall, F1 and both timings are printed.
    """
    # perf_counter is the clock intended for measuring short durations; unlike
    # time.time() it is not affected by system clock adjustments.
    start = time.perf_counter()
    lexicon = create_lexicon(train_file_path, counter_most_common)
    train_time = time.perf_counter() - start

    start = time.perf_counter()
    predictions, labels = predict(test_file_path, lexicon)
    inference_time = time.perf_counter() - start

    accuracy, precision, recall, f1 = evaluate_results(
        predictions, labels, num_classes=len(emotions)
    )

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1: {f1:.2f}")

    print(f"Train Time: {train_time:.2f} seconds")
    print(f"Inference Time: {inference_time:.2f} seconds")

    print("End")

In [14]:
# Execute the end-to-end pipeline; run() prints the metrics and timings itself.
run()

Accuracy: 0.60
Precision: 0.60
Recall: 0.63
F1: 0.55
Train Time: 0.13 seconds
Inference Time: 0.02 seconds
End
