In [None]:
import numpy as np
import pandas as pd
import os

import warnings
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Single place to change when the dataset lives somewhere else.
DATA_DIR = "/content/drive/MyDrive/nlp/archive"
# Each split is read as a headerless, ';'-separated file with two columns.
READ_OPTIONS = dict(sep=';', header=None, names=["Text", "Emotion"])

train_df = pd.read_csv(f"{DATA_DIR}/train.txt", **READ_OPTIONS)
test_df = pd.read_csv(f"{DATA_DIR}/test.txt", **READ_OPTIONS)
val_df = pd.read_csv(f"{DATA_DIR}/val.txt", **READ_OPTIONS)

In [None]:
train_df["Emotion"].value_counts()

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
joy,5362
sadness,4666
anger,2159
fear,1937
love,1304
surprise,572


In [None]:
train_df.head(5)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     16000 non-null  object
 1   Emotion  16000 non-null  object
dtypes: object(2)
memory usage: 250.1+ KB


In [None]:
train_df.describe()

Unnamed: 0,Text,Emotion
count,16000,16000
unique,15969,6
top,im still not sure why reilly feels the need to...,joy
freq,2,5362


In [None]:
print(f"Duplicated rows on train.txt: {train_df.duplicated().sum()}")

Duplicated rows on train.txt: 1


In [None]:
# Remove rows that are duplicated across all columns (same Text and same Emotion), in place.
# The index is not reset, so the surviving rows keep their original index labels.
train_df.drop_duplicates(inplace=True)

In [None]:
train_df.isna().sum()

Unnamed: 0,0
Text,0
Emotion,0


In [None]:
test_df["Emotion"].value_counts()

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
joy,695
sadness,581
anger,275
fear,224
love,159
surprise,66


In [None]:
test_df.head(5)

Unnamed: 0,Text,Emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


In [None]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     2000 non-null   object
 1   Emotion  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [None]:
test_df.describe()

Unnamed: 0,Text,Emotion
count,2000,2000
unique,2000,6
top,i feel all weird when i have to meet w people ...,joy
freq,1,695


In [None]:
print(f"Duplicated rows on test.txt: {test_df.duplicated().sum()}")

Duplicated rows on test.txt: 0


In [None]:
test_df.isna().sum()

Unnamed: 0,0
Text,0
Emotion,0


In [None]:
val_df["Emotion"].value_counts()

Unnamed: 0_level_0,count
Emotion,Unnamed: 1_level_1
joy,704
sadness,550
anger,275
fear,212
love,178
surprise,81


In [None]:
val_df.head(5)

Unnamed: 0,Text,Emotion
0,im feeling quite sad and sorry for myself but ...,sadness
1,i feel like i am still looking at a blank canv...,sadness
2,i feel like a faithful servant,love
3,i am just feeling cranky and blue,anger
4,i can have for a treat or if i am feeling festive,joy


In [None]:
val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     2000 non-null   object
 1   Emotion  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [None]:
val_df.describe()

Unnamed: 0,Text,Emotion
count,2000,2000
unique,1998,6
top,i feel so tortured by it,joy
freq,2,704


In [None]:
print(f"Duplicated rows on val.txt: {val_df.duplicated().sum()}")

Duplicated rows on val.txt: 0


In [None]:
val_df.isna().sum()

Unnamed: 0,0
Text,0
Emotion,0


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("stopwords")

def get_text_length(text):
    """Return how many tokens ``word_tokenize`` splits ``text`` into."""
    tokens = word_tokenize(text)
    return len(tokens)

lemma = WordNetLemmatizer()
stemmer = PorterStemmer()
eng_stopwords = stopwords.words('english')
# Set copy of the stop-word list: membership tests become O(1) instead of a
# linear scan of the list for every token of every document.
_eng_stopwords_set = frozenset(eng_stopwords)

def preprocess(text):
    """Tokenize ``text`` and return its filtered, normalized tokens.

    Steps:
      1. Split ``text`` with ``word_tokenize``.
      2. Keep only purely alphabetic tokens that are not English stop words.
         (``str.isalpha`` already rejects punctuation tokens, so no separate
         punctuation test is needed.)
      3. Lemmatize each kept token with WordNet, then stem the lemma with the
         Porter stemmer.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        The normalized tokens, in their original order.
    """
    kept = [
        word
        for word in word_tokenize(text)
        if word.isalpha() and word not in _eng_stopwords_set
    ]
    return [stemmer.stem(lemma.lemmatize(word)) for word in kept]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Token count of the raw text, added to every split.
for split_df in (train_df, test_df, val_df):
    split_df["TextLength"] = split_df["Text"].apply(get_text_length)

In [None]:
# Cleaned token list (see preprocess) for every split.
for split_df in (train_df, test_df, val_df):
    split_df["TokenizedText"] = split_df["Text"].apply(preprocess)

In [None]:
print(train_df["TokenizedText"])

0                                    [didnt, feel, humili]
1        [go, feel, hopeless, damn, hope, around, someo...
2             [im, grab, minut, post, feel, greedi, wrong]
3        [ever, feel, nostalg, fireplac, know, still, p...
4                                          [feel, grouchi]
                               ...                        
15995    [brief, time, beanbag, said, anna, feel, like,...
15996    [turn, feel, pathet, still, wait, tabl, sub, t...
15997                         [feel, strong, good, overal]
15998                [feel, like, rude, comment, im, glad]
15999                   [know, lot, feel, stupid, portray]
Name: TokenizedText, Length: 15999, dtype: object


In [None]:
print(test_df["TokenizedText"])

0           [im, feel, rather, rotten, im, ambiti, right]
1                         [im, updat, blog, feel, shitti]
2       [never, make, separ, ever, want, feel, like, a...
3       [left, bouquet, red, yellow, tulip, arm, feel,...
4                                [feel, littl, vain, one]
                              ...                        
1995    [keep, feel, like, someon, unkind, wrong, thin...
1996      [im, feel, littl, cranki, neg, doctor, appoint]
1997        [feel, use, peopl, give, great, feel, achiev]
1998    [im, feel, comfort, derbi, feel, though, start...
1999    [feel, weird, meet, w, peopl, text, like, dont...
Name: TokenizedText, Length: 2000, dtype: object


In [None]:
print(val_df["TokenizedText"])

0           [im, feel, quit, sad, sorri, ill, snap, soon]
1       [feel, like, still, look, blank, canva, blank,...
2                            [feel, like, faith, servant]
3                                    [feel, cranki, blue]
4                                   [treat, feel, festiv]
                              ...                        
1995    [im, ssa, examin, tomorrow, morn, im, quit, we...
1996    [constantli, worri, fight, natur, push, limit,...
1997           [feel, import, share, info, experi, thing]
1998    [truli, feel, passion, enough, someth, stay, t...
1999    [feel, like, wan, na, buy, cute, make, see, on...
Name: TokenizedText, Length: 2000, dtype: object


In [None]:
train_df.describe()

Unnamed: 0,TextLength
count,15999.0
mean,19.175761
std,10.992922
min,2.0
25%,11.0
50%,17.0
75%,25.0
max,66.0


In [None]:
test_df.describe()

Unnamed: 0,TextLength
count,2000.0
mean,19.161
std,11.015432
min,3.0
25%,10.75
50%,17.0
75%,26.0
max,61.0


In [None]:
val_df.describe()

Unnamed: 0,TextLength
count,2000.0
mean,18.877
std,10.818058
min,2.0
25%,10.0
50%,17.0
75%,25.0
max,61.0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# The vectorizer consumes strings, so join each token list back into one
# space-separated document.
for split_df in (train_df, test_df, val_df):
    split_df["PreprocessedText"] = split_df["TokenizedText"].apply(lambda tokens: " ".join(tokens))

# Fit on the training split only; the other splits are transformed with the
# already-fitted vectorizer.
train_df_matrix = vectorizer.fit_transform(train_df["PreprocessedText"])
test_df_matrix = vectorizer.transform(test_df["PreprocessedText"])
val_df_matrix = vectorizer.transform(val_df["PreprocessedText"])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

model_lr = LogisticRegression(max_iter=1000)
model_svc = SVC()
model_nb = MultinomialNB()
# A random forest is randomized; pin the seed so re-running the notebook
# reproduces the same forest and the same report.
model_rf = RandomForestClassifier(random_state=42)

# Every baseline is trained on the same TF-IDF matrix and emotion labels.
y_train = train_df["Emotion"]
model_lr.fit(train_df_matrix, y_train)
model_svc.fit(train_df_matrix, y_train)
model_nb.fit(train_df_matrix, y_train)
model_rf.fit(train_df_matrix, y_train)

In [None]:
# Test-split predictions of the four baselines, in the order LR, SVC, NB, RF.
y_pred_lr, y_pred_svc, y_pred_nb, y_pred_rf = [
    classifier.predict(test_df_matrix)
    for classifier in (model_lr, model_svc, model_nb, model_rf)
]

In [None]:
from sklearn.metrics import classification_report

def evaluate_model(y_test, y_pred):
    """Print scikit-learn's classification report for ``y_pred`` against ``y_test``."""
    report = classification_report(y_test, y_pred)
    print(report)

In [None]:
print("--- Classification Report of Logistic Regression ---")
evaluate_model(test_df["Emotion"], y_pred_lr)

--- Classification Report of Logistic Regression ---
              precision    recall  f1-score   support

       anger       0.85      0.79      0.82       275
        fear       0.86      0.79      0.82       224
         joy       0.83      0.94      0.88       695
        love       0.76      0.52      0.62       159
     sadness       0.87      0.90      0.89       581
    surprise       0.85      0.52      0.64        66

    accuracy                           0.84      2000
   macro avg       0.84      0.74      0.78      2000
weighted avg       0.84      0.84      0.84      2000



In [None]:
print("--- Classification Report of SVC ---")
evaluate_model(test_df["Emotion"], y_pred_svc)

--- Classification Report of SVC ---
              precision    recall  f1-score   support

       anger       0.85      0.80      0.83       275
        fear       0.83      0.79      0.81       224
         joy       0.82      0.94      0.88       695
        love       0.81      0.49      0.61       159
     sadness       0.88      0.89      0.88       581
    surprise       0.77      0.56      0.65        66

    accuracy                           0.84      2000
   macro avg       0.83      0.74      0.78      2000
weighted avg       0.84      0.84      0.84      2000



In [None]:
print("--- Classification Report of Naive Bayes (Multinomial) ---")
evaluate_model(test_df["Emotion"], y_pred_nb)

--- Classification Report of Naive Bayes (Multinomial) ---
              precision    recall  f1-score   support

       anger       0.93      0.33      0.48       275
        fear       0.92      0.31      0.47       224
         joy       0.65      0.98      0.78       695
        love       1.00      0.07      0.13       159
     sadness       0.68      0.90      0.78       581
    surprise       0.00      0.00      0.00        66

    accuracy                           0.69      2000
   macro avg       0.70      0.43      0.44      2000
weighted avg       0.73      0.69      0.63      2000



In [None]:
print("--- Classification Report of Random Forest ---")
evaluate_model(test_df["Emotion"], y_pred_rf)

--- Classification Report of Random Forest ---
              precision    recall  f1-score   support

       anger       0.83      0.88      0.86       275
        fear       0.79      0.83      0.81       224
         joy       0.88      0.89      0.89       695
        love       0.74      0.65      0.69       159
     sadness       0.92      0.88      0.90       581
    surprise       0.59      0.61      0.60        66

    accuracy                           0.85      2000
   macro avg       0.79      0.79      0.79      2000
weighted avg       0.85      0.85      0.85      2000



## State-of-the-Art Models

In [None]:
!pip install transformers datasets accelerate pandas scikit-learn torch torchvision torchaudio

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Remove the columns that were only needed by the TF-IDF baselines.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
_classical_feature_columns = ["TextLength", "TokenizedText", "PreprocessedText"]

train_df = train_df.drop(columns=_classical_feature_columns, errors="ignore")
val_df = val_df.drop(columns=_classical_feature_columns, errors="ignore")
test_df = test_df.drop(columns=_classical_feature_columns, errors="ignore")

In [None]:
label_column_name = 'Emotion'

# The label vocabulary comes from the training split; sorting makes the
# label -> id assignment deterministic.
unique_labels = train_df[label_column_name].unique().tolist()
unique_labels.sort()

id2label = {i: label for i, label in enumerate(unique_labels)}
label2id = {label: i for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)

print(f"Unique text labels: {unique_labels}")
print(f"label2id mapping: {label2id}")
print(f"id2label mapping: {id2label}")
print(f"Number of unique labels (num_labels): {num_labels}")

# A label missing from the training split would map to NaN and only fail much
# later, inside training, with an unrelated-looking error. Fail here instead.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    unmapped = set(split_df[label_column_name].unique()) - set(label2id)
    if unmapped:
        raise ValueError(
            f"{split_name} split contains labels that are not in the training split: {sorted(unmapped)}"
        )

train_df['Label'] = train_df[label_column_name].map(label2id)
val_df['Label'] = val_df[label_column_name].map(label2id)
test_df['Label'] = test_df[label_column_name].map(label2id)

# Plain Python lists: the form the tokenizers and EmotionDataset are fed with.
train_texts_list = train_df['Text'].tolist()
train_labels_list = train_df['Label'].tolist()

val_texts_list = val_df['Text'].tolist()
val_labels_list = val_df['Label'].tolist()

test_texts_list = test_df['Text'].tolist()
test_labels_list = test_df['Label'].tolist()

Unique text labels: ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
label2id mapping: {'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}
id2label mapping: {0: 'anger', 1: 'fear', 2: 'joy', 3: 'love', 4: 'sadness', 5: 'surprise'}
Number of unique labels (num_labels): 6


In [None]:
print("\nDataFrame after mapping labels:")
print(train_df.head())


DataFrame after mapping labels:
                                                Text  Emotion  Label
0                            i didnt feel humiliated  sadness      4
1  i can go from feeling so hopeless to so damned...  sadness      4
2   im grabbing a minute to post i feel greedy wrong    anger      0
3  i am ever feeling nostalgic about the fireplac...     love      3
4                               i am feeling grouchy    anger      0


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BertTokenizer,
    BertForSequenceClassification,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
    ElectraTokenizer,
    ElectraForSequenceClassification
)

class EmotionDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their class labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one label per example. Indexing returns a
    dict of tensors holding every encoding field plus a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for a prediction batch.

    Macro averaging is used on purpose: for single-label multiclass problems
    micro-averaged precision, recall and F1 are all equal to accuracy, so they
    carry no extra information. Macro averaging gives every class the same
    weight regardless of its support.

    Parameters
    ----------
    pred
        Object exposing ``predictions`` (per-class scores, argmax is taken over
        axis 1) and ``label_ids`` (true class ids).

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    # zero_division=0: a class that is never predicted scores 0 instead of warning.
    averaging = dict(average='macro', zero_division=0)

    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, **averaging),
        'recall': recall_score(labels, preds, **averaging),
        'f1': f1_score(labels, preds, **averaging),
    }

In [None]:
def train_and_evaluate_model(model_name, tokenizer_class, model_class,
                             num_train_epochs=3, train_batch_size=8,
                             eval_batch_size=16, max_length=128):
    """Fine-tune a pretrained checkpoint on the emotion data, evaluate it and save it.

    The function reads these notebook-level names: ``train_texts_list``,
    ``train_labels_list``, ``val_texts_list``, ``val_labels_list``,
    ``num_labels``, ``id2label``, ``label2id``, ``EmotionDataset`` and
    ``compute_metrics``.

    Parameters
    ----------
    model_name : str
        Checkpoint identifier passed to ``from_pretrained``.
    tokenizer_class
        Tokenizer class providing ``from_pretrained``.
    model_class
        Sequence-classification model class providing ``from_pretrained``.
    num_train_epochs : int, default 3
        Number of training epochs.
    train_batch_size : int, default 8
        Per-device training batch size.
    eval_batch_size : int, default 16
        Per-device evaluation batch size.
    max_length : int, default 128
        Truncation length used when tokenizing the train and validation texts.

    Returns
    -------
    tuple
        ``(model, tokenizer, eval_results)`` where ``eval_results`` is the dict
        returned by ``trainer.evaluate()`` on the validation set.

    Side effects
    ------------
    Writes checkpoints under ``./results/<name>`` and the final model under
    ``./fine_tuned_models/<name>``, where ``<name>`` is ``model_name`` with
    ``/`` replaced by ``_``.
    """
    print(f"--- Training and Evaluating: {model_name} ---")

    tokenizer = tokenizer_class.from_pretrained(model_name)

    # Padding requires a pad token; add one when the tokenizer has none.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    train_encodings = tokenizer(train_texts_list, truncation=True, padding=True, max_length=max_length)
    val_encodings = tokenizer(val_texts_list, truncation=True, padding=True, max_length=max_length)

    train_dataset = EmotionDataset(train_encodings, train_labels_list)
    val_dataset = EmotionDataset(val_encodings, val_labels_list)

    # Store the label mapping in the config so the saved model carries it.
    config = AutoConfig.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )

    model = model_class.from_pretrained(model_name, config=config)
    # Keep the embedding matrix in sync with the tokenizer (a pad token may have been added).
    model.resize_token_embeddings(len(tokenizer))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Checkpoint names such as "google/electra-..." contain "/", which would
    # create nested directories; flatten them.
    safe_model_name = model_name.replace("/", "_")

    training_args = TrainingArguments(
        output_dir=f'./results/{safe_model_name}',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f'./logs/{safe_model_name}',
        logging_steps=50,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none",
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    print(f"\n--- Training Result for {model_name} ---")
    trainer.train()

    print(f"\n--- Evaluation Result for {model_name} ---")
    eval_results = trainer.evaluate()
    print(f"Evaluation results for {model_name}: {eval_results}")

    model_save_path = f"./fine_tuned_models/{safe_model_name}"
    os.makedirs(model_save_path, exist_ok=True)
    trainer.save_model(model_save_path)

    return model, tokenizer, eval_results

In [None]:
bert_model, bert_tokenizer, bert_results = train_and_evaluate_model(
    model_name='bert-base-uncased',
    tokenizer_class=BertTokenizer,
    model_class=BertForSequenceClassification
)

--- Training and Evaluating: bert-base-uncased ---


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Training Result for bert-base-uncased ---


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2677,0.246044,0.9335,0.9335,0.9335,0.9335
2,0.0956,0.168198,0.9395,0.9395,0.9395,0.9395
3,0.0458,0.20828,0.9405,0.9405,0.9405,0.9405



--- Evaluation Result for bert-base-uncased ---


Evaluation results for bert-base-uncased: {'eval_loss': 0.20828036963939667, 'eval_accuracy': 0.9405, 'eval_precision': 0.9405, 'eval_recall': 0.9405, 'eval_f1': 0.9405, 'eval_runtime': 2.8368, 'eval_samples_per_second': 705.029, 'eval_steps_per_second': 44.064, 'epoch': 3.0}


In [None]:
roberta_model, roberta_tokenizer, roberta_results = train_and_evaluate_model(
    model_name='roberta-base',
    tokenizer_class=RobertaTokenizer,
    model_class=RobertaForSequenceClassification
)

--- Training and Evaluating: roberta-base ---


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Training Result for roberta-base ---


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1183,1.130909,0.597,0.597,0.597,0.597
2,0.6878,0.660677,0.7925,0.7925,0.7925,0.7925
3,0.3438,0.420555,0.9015,0.9015,0.9015,0.9015



--- Evaluation Result for roberta-base ---


Evaluation results for roberta-base: {'eval_loss': 0.4205547571182251, 'eval_accuracy': 0.9015, 'eval_precision': 0.9015, 'eval_recall': 0.9015, 'eval_f1': 0.9015, 'eval_runtime': 2.3244, 'eval_samples_per_second': 860.425, 'eval_steps_per_second': 53.777, 'epoch': 3.0}


In [None]:
xlnet_model, xlnet_tokenizer, xlnet_results = train_and_evaluate_model(
    model_name='xlnet-base-cased',
    tokenizer_class=XLNetTokenizer,
    model_class=XLNetForSequenceClassification
)

--- Training and Evaluating: xlnet-base-cased ---


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Training Result for xlnet-base-cased ---


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3212,0.298671,0.9205,0.9205,0.9205,0.9205
2,0.1776,0.192342,0.9375,0.9375,0.9375,0.9375
3,0.1112,0.179995,0.9435,0.9435,0.9435,0.9435


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]


--- Evaluation Result for xlnet-base-cased ---


Evaluation results for xlnet-base-cased: {'eval_loss': 0.17999503016471863, 'eval_accuracy': 0.9435, 'eval_precision': 0.9435, 'eval_recall': 0.9435, 'eval_f1': 0.9435, 'eval_runtime': 4.3453, 'eval_samples_per_second': 460.266, 'eval_steps_per_second': 28.767, 'epoch': 3.0}


In [None]:
electra_model, electra_tokenizer, electra_results = train_and_evaluate_model(
    model_name='google/electra-base-discriminator',
    tokenizer_class=ElectraTokenizer,
    model_class=ElectraForSequenceClassification
)

--- Training and Evaluating: google/electra-base-discriminator ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Training Result for google/electra-base-discriminator ---


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3946,0.336895,0.9145,0.9145,0.9145,0.9145
2,0.217,0.192015,0.939,0.939,0.939,0.939
3,0.1159,0.159589,0.9405,0.9405,0.9405,0.9405


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


--- Evaluation Result for google/electra-base-discriminator ---


Evaluation results for google/electra-base-discriminator: {'eval_loss': 0.15958921611309052, 'eval_accuracy': 0.9405, 'eval_precision': 0.9405, 'eval_recall': 0.9405, 'eval_f1': 0.9405, 'eval_runtime': 2.6771, 'eval_samples_per_second': 747.065, 'eval_steps_per_second': 46.692, 'epoch': 3.0}


In [None]:
def evaluate_fine_tuned_model(model_name, saved_model_path, max_length=128, batch_size=16):
    """Load a saved fine-tuned classifier and report its results on the test split.

    The function reads these notebook-level names: ``test_texts_list``,
    ``test_labels_list``, ``num_labels``, ``id2label``, ``EmotionDataset``,
    ``compute_metrics`` and ``classification_report``.

    Parameters
    ----------
    model_name : str
        Display name of the model. Kept for interface compatibility; it does
        not influence how the model is loaded or evaluated.
    saved_model_path : str
        Directory the model and tokenizer are loaded from.
    max_length : int, default 128
        Truncation length used when tokenizing the test texts.
    batch_size : int, default 16
        Per-device evaluation batch size.

    Returns
    -------
    tuple
        ``(loaded_model, loaded_tokenizer, test_predictions_output)`` where the
        last item is the result of ``Trainer.predict`` on the test dataset.

    Raises
    ------
    OSError
        Re-raised after printing a hint when the model or tokenizer cannot be
        loaded from ``saved_model_path``.
    """
    try:
        loaded_model = AutoModelForSequenceClassification.from_pretrained(saved_model_path)
        loaded_tokenizer = AutoTokenizer.from_pretrained(saved_model_path)

        # Padding needs a pad token whatever the architecture is, so the
        # fallback is applied to any tokenizer lacking one rather than being
        # keyed on the display name.
        if loaded_tokenizer.pad_token is None:
            loaded_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            loaded_model.resize_token_embeddings(len(loaded_tokenizer))

    except OSError as e:
        print(f"Error: Could not load model from {saved_model_path}. "
              f"Please ensure the path is correct, the model was saved, and library versions are compatible.")
        print(f"Details: {e}")
        raise

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded_model.to(device)
    loaded_model.eval()

    test_encodings = loaded_tokenizer(test_texts_list, truncation=True, padding=True, max_length=max_length)

    test_dataset = EmotionDataset(test_encodings, test_labels_list)

    prediction_output_dir = os.path.join(saved_model_path, 'test_eval_output')

    # The Trainer is used for prediction only, so training and evaluation loops are disabled.
    pred_args = TrainingArguments(
        output_dir=prediction_output_dir,
        per_device_eval_batch_size=batch_size,
        do_train=False,
        do_eval=False,
        report_to="none",
        fp16=torch.cuda.is_available(),
    )

    tester = Trainer(
        model=loaded_model,
        args=pred_args,
        tokenizer=loaded_tokenizer,
        compute_metrics=compute_metrics
    )

    test_predictions_output = tester.predict(test_dataset)

    for key, value in test_predictions_output.metrics.items():
        print(f"  {key}: {value:.4f}")

    logits_final = test_predictions_output.predictions
    predicted_labels_ids_final = np.argmax(logits_final, axis=1)
    true_labels_ids_final = test_predictions_output.label_ids

    # Pass the full id list explicitly: without it the report infers the classes
    # from the data and fails against target_names if a class is absent from
    # both the true and the predicted labels.
    report_label_ids = list(range(num_labels))
    target_names_report = [id2label[i] for i in report_label_ids]
    print("\n --- Classification Report --- ")
    print(classification_report(
        true_labels_ids_final,
        predicted_labels_ids_final,
        labels=report_label_ids,
        target_names=target_names_report,
        zero_division=0,
    ))

    return loaded_model, loaded_tokenizer, test_predictions_output

In [None]:
# One row per fine-tuned model: banner to print, display name, saved model directory.
_test_runs = (
    ("--- Evaluate Fine-Tuned BERT Model ---", "BERT", "./fine_tuned_models/bert-base-uncased"),
    ("\n--- Evaluate Fine-Tuned RoBERTa Model ---", "RoBERTa", "./fine_tuned_models/roberta-base"),
    ("\n--- Evaluate Fine-Tuned XLNet Model ---", "XLNet", "./fine_tuned_models/xlnet-base-cased"),
    ("\n--- Evaluate Fine-Tuned ELECTRA Model ---", "ELECTRA", "./fine_tuned_models/google_electra-base-discriminator"),
)

_test_outputs = []
for banner, display_name, model_dir in _test_runs:
    print(banner)
    # Only the prediction output is kept; the reloaded model and tokenizer are discarded.
    _, _, prediction_output = evaluate_fine_tuned_model(
        model_name=display_name,
        saved_model_path=model_dir,
    )
    _test_outputs.append(prediction_output)

bert_test_results, roberta_test_results, xlnet_test_results, electra_test_results = _test_outputs

--- Evaluate Fine-Tuned BERT Model ---


  test_loss: 0.2352
  test_model_preparation_time: 0.0052
  test_accuracy: 0.9285
  test_precision: 0.9285
  test_recall: 0.9285
  test_f1: 0.9285
  test_runtime: 3.8776
  test_samples_per_second: 515.7870
  test_steps_per_second: 32.2370

 --- Classification Report --- 
              precision    recall  f1-score   support

       anger       0.94      0.93      0.93       275
        fear       0.88      0.91      0.89       224
         joy       0.95      0.95      0.95       695
        love       0.82      0.81      0.81       159
     sadness       0.97      0.97      0.97       581
    surprise       0.73      0.77      0.75        66

    accuracy                           0.93      2000
   macro avg       0.88      0.89      0.88      2000
weighted avg       0.93      0.93      0.93      2000


--- Evaluate Fine-Tuned RoBERTa Model ---


  test_loss: 0.3771
  test_model_preparation_time: 0.0052
  test_accuracy: 0.9055
  test_precision: 0.9055
  test_recall: 0.9055
  test_f1: 0.9055
  test_runtime: 3.4158
  test_samples_per_second: 585.5200
  test_steps_per_second: 36.5950

 --- Classification Report --- 
              precision    recall  f1-score   support

       anger       0.88      0.90      0.89       275
        fear       0.97      0.79      0.87       224
         joy       0.93      0.95      0.94       695
        love       0.77      0.72      0.74       159
     sadness       0.94      0.96      0.95       581
    surprise       0.66      0.79      0.72        66

    accuracy                           0.91      2000
   macro avg       0.86      0.85      0.85      2000
weighted avg       0.91      0.91      0.90      2000


--- Evaluate Fine-Tuned XLNet Model ---


  test_loss: 0.2273
  test_model_preparation_time: 0.0024
  test_accuracy: 0.9275
  test_precision: 0.9275
  test_recall: 0.9275
  test_f1: 0.9275
  test_runtime: 4.3339
  test_samples_per_second: 461.4760
  test_steps_per_second: 28.8420

 --- Classification Report --- 
              precision    recall  f1-score   support

       anger       0.92      0.92      0.92       275
        fear       0.87      0.88      0.88       224
         joy       0.95      0.95      0.95       695
        love       0.83      0.87      0.85       159
     sadness       0.97      0.96      0.97       581
    surprise       0.75      0.74      0.75        66

    accuracy                           0.93      2000
   macro avg       0.88      0.89      0.89      2000
weighted avg       0.93      0.93      0.93      2000


--- Evaluate Fine-Tuned ELECTRA Model ---


  test_loss: 0.1680
  test_model_preparation_time: 0.0030
  test_accuracy: 0.9355
  test_precision: 0.9355
  test_recall: 0.9355
  test_f1: 0.9355
  test_runtime: 2.6022
  test_samples_per_second: 768.5910
  test_steps_per_second: 48.0370

 --- Classification Report --- 
              precision    recall  f1-score   support

       anger       0.90      0.94      0.92       275
        fear       0.92      0.91      0.91       224
         joy       0.95      0.97      0.96       695
        love       0.87      0.80      0.83       159
     sadness       0.97      0.97      0.97       581
    surprise       0.87      0.73      0.79        66

    accuracy                           0.94      2000
   macro avg       0.91      0.88      0.90      2000
weighted avg       0.93      0.94      0.93      2000

