In [1]:
# 1. Imports
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [2]:
# 2. Safe vector parser
def parse_vector(v):
    if isinstance(v, str):
        return list(map(float, re.findall(r"[-+]?\d*\.\d+|\d+", v)))
    return v

def safe_vector_list(series, expected_dim):
    return np.vstack(series.apply(parse_vector).apply(lambda x: x if len(x) == expected_dim else np.zeros(expected_dim)))

In [3]:
# 3. Load training data
df = pd.read_csv("NLP_features(lr_nb).csv")
df["TF_IDF"] = df["TF_IDF"].apply(parse_vector)
df["Pretrained_Embeddings"] = df["Pretrained_Embeddings"].apply(parse_vector)
df["Custom_Embeddings"] = df["Custom_Embeddings"].apply(parse_vector)
df["POS_Tags"] = df["POS_Tags"].apply(eval)

In [4]:
# 4. Encode labels
label_encoder_emotion = LabelEncoder()
df["emotion_label"] = label_encoder_emotion.fit_transform(df["Emotion"])
emotion_classes = label_encoder_emotion.classes_

In [5]:
# 5. Features
tfidf_features = safe_vector_list(df["TF_IDF"], expected_dim=100)
pretrained_embeddings = safe_vector_list(df["Pretrained_Embeddings"], expected_dim=300)
custom_embeddings = safe_vector_list(df["Custom_Embeddings"], expected_dim=300)
pos_mlb = MultiLabelBinarizer()
pos_encoded = pos_mlb.fit_transform(df["POS_Tags"])
extra_features = df[[
    "Sentiment_Score", "Polarity", "Subjectivity",
    "Exclamations", "Is_Question", "Personal_Pronoun_Count"
]].apply(pd.to_numeric, errors="coerce").fillna(0).values

X_raw = np.hstack([tfidf_features, pos_encoded, extra_features, pretrained_embeddings, custom_embeddings])
y = df["emotion_label"].values

In [6]:
# 6. Scale + split
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train_raw, X_val_raw, _, _ = train_test_split(X_raw, y, test_size=0.2, random_state=42)

In [7]:
# 7. Logistic Regression grid search
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10],
    "solver": ["saga"],
    "penalty": ["l2"],
    "max_iter": [2000],
    "class_weight": ["balanced"]
}
lr_search = GridSearchCV(LogisticRegression(), param_grid_lr, scoring="f1_weighted", cv=3, n_jobs=-1, verbose=1)
lr_search.fit(X_train, y_train)
best_lr = lr_search.best_estimator_

Fitting 3 folds for each of 4 candidates, totalling 12 fits




In [8]:
# 8. Naive Bayes grid search (on raw, non-negative)
X_train_nb = X_train_raw.clip(min=0)
X_val_nb = X_val_raw.clip(min=0)
param_grid_nb = { "alpha": [0.001, 0.01, 0.1, 1.0, 10.0] }
nb_search = GridSearchCV(MultinomialNB(), param_grid_nb, scoring="f1_weighted", cv=3, n_jobs=-1, verbose=1)
nb_search.fit(X_train_nb, y_train)
best_nb = nb_search.best_estimator_

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [9]:
# 9. Load & process test data
df_test = pd.read_csv("NLP_test(lr_nb).csv")
df_test = df_test[~df_test["Emotion"].isin(["[]", "", None])].copy()
df_test["emotion_label"] = label_encoder_emotion.transform(df_test["Emotion"])
df_test["TF_IDF"] = df_test["TF_IDF"].apply(parse_vector)
df_test["Pretrained_Embeddings"] = df_test["Pretrained_Embeddings"].apply(parse_vector)
df_test["Custom_Embeddings"] = df_test["Custom_Embeddings"].apply(parse_vector)
df_test["POS_Tags"] = df_test["POS_Tags"].apply(eval)

test_tfidf = safe_vector_list(df_test["TF_IDF"], expected_dim=100)
test_pretrained = safe_vector_list(df_test["Pretrained_Embeddings"], expected_dim=300)
test_custom = safe_vector_list(df_test["Custom_Embeddings"], expected_dim=300)
test_pos = pos_mlb.transform(df_test["POS_Tags"])
test_extra = df_test[[
    "Sentiment_Score", "Polarity", "Subjectivity",
    "Exclamations", "Is_Question", "Personal_Pronoun_Count"
]].apply(pd.to_numeric, errors="coerce").fillna(0).values

X_test_raw = np.hstack([test_tfidf, test_pos, test_extra, test_pretrained, test_custom])
X_test_scaled = scaler.transform(X_test_raw)
y_test = df_test["emotion_label"].values

In [10]:
# 10. Evaluation helper
def evaluate(name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"\n📊 {name} Set")
    print(f"Accuracy:  {acc:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(classification_report(y_true, y_pred, target_names=emotion_classes, zero_division=0))
    return acc, f1

In [11]:
# 11. Predict
val_preds_lr = best_lr.predict(X_val)
test_preds_lr = best_lr.predict(X_test_scaled)

val_preds_nb = best_nb.predict(X_val_nb)
test_preds_nb = best_nb.predict(X_test_raw.clip(min=0))

In [12]:
# 12. Report
val_acc_lr, val_f1_lr = evaluate("Validation (LogReg)", y_val, val_preds_lr)
test_acc_lr, test_f1_lr = evaluate("Test (LogReg)", y_test, test_preds_lr)

val_acc_nb, val_f1_nb = evaluate("Validation (NaiveBayes)", y_val, val_preds_nb)
test_acc_nb, test_f1_nb = evaluate("Test (NaiveBayes)", y_test, test_preds_nb)


📊 Validation (LogReg) Set
Accuracy:  0.7691
F1 Score:  0.7674
              precision    recall  f1-score   support

       anger       0.70      0.81      0.75        68
     disgust       0.78      0.85      0.81        61
        fear       0.81      0.84      0.82        73
   happiness       0.76      0.64      0.69       105
     neutral       0.79      0.72      0.76       154
     sadness       0.73      0.89      0.80        62
    surprise       0.79      0.78      0.79        79

    accuracy                           0.77       602
   macro avg       0.77      0.79      0.78       602
weighted avg       0.77      0.77      0.77       602


📊 Test (LogReg) Set
Accuracy:  0.8713
F1 Score:  0.8710
              precision    recall  f1-score   support

       anger       0.88      0.86      0.87       392
     disgust       0.82      0.90      0.86       324
        fear       0.88      0.94      0.91       317
   happiness       0.88      0.82      0.85       545
     neutral

In [13]:
# 13. Compare
summary_df = pd.DataFrame({
    "Model": ["LogReg", "NaiveBayes", "LogReg", "NaiveBayes"],
    "Dataset": ["Validation", "Validation", "Test", "Test"],
    "Accuracy": [val_acc_lr, val_acc_nb, test_acc_lr, test_acc_nb],
    "F1 Score": [val_f1_lr, val_f1_nb, test_f1_lr, test_f1_nb]
})
display(summary_df)

# Optional: show best hyperparameters
print("\nBest Logistic Regression Params:", lr_search.best_params_)
print("Best Naive Bayes Params:", nb_search.best_params_)

Unnamed: 0,Model,Dataset,Accuracy,F1 Score
0,LogReg,Validation,0.769103,0.767418
1,NaiveBayes,Validation,0.519934,0.540139
2,LogReg,Test,0.871343,0.870984
3,NaiveBayes,Test,0.529588,0.543039



Best Logistic Regression Params: {'C': 10, 'class_weight': 'balanced', 'max_iter': 2000, 'penalty': 'l2', 'solver': 'saga'}
Best Naive Bayes Params: {'alpha': 0.001}
