# In [None]:

# TEXT EMOTION DETECTION with Multiple Model Comparison

# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import neattext.functions as nfx

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import joblib

# Step 2: Read the raw emotion dataset from disk
df = pd.read_csv("../data/emotion_dataset_raw.csv")

# Step 3: Clean the text -- user handles are removed first, then stopwords;
# the result is stored in a new column so the raw 'Text' column is untouched.
df['Clean_Text'] = (
    df['Text']
    .apply(nfx.remove_userhandles)
    .apply(nfx.remove_stopwords)
)

# Step 4: Features are the cleaned text, labels are the emotion column
x, y = df['Clean_Text'], df['Emotion']

# Step 5: Hold out 30% of the rows for evaluation; the fixed random_state
# keeps the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 6: Build one pipeline per classifier. Each pipeline owns its own
# CountVectorizer, so every model is fitted and saved as a self-contained unit.
pipe_lr = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression()),
])
pipe_svm = Pipeline([
    ('cv', CountVectorizer()),
    ('svc', SVC(probability=True)),
])
pipe_rf = Pipeline([
    ('cv', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=100)),
])

# Step 7: Fit every pipeline on the training split (same order as defined)
for _pipe in (pipe_lr, pipe_svm, pipe_rf):
    _pipe.fit(x_train, y_train)

# Step 8: Score each fitted pipeline on the held-out split.
# The scores are kept in module-level names because predict_best_model
# reads them to decide which pipeline to use.
lr_acc, svm_acc, rf_acc = (
    fitted.score(x_test, y_test)
    for fitted in (pipe_lr, pipe_svm, pipe_rf)
)

print("Logistic Regression Accuracy:", lr_acc)
print("SVM Accuracy:", svm_acc)
print("Random Forest Accuracy:", rf_acc)

# Step 9: Persist every fitted pipeline (vectorizer + classifier) to a
# .pkl file; the paths are relative, so they land in the working directory.
for _pipe, _path in (
    (pipe_lr, "text_emotion_lr.pkl"),
    (pipe_svm, "text_emotion_svm.pkl"),
    (pipe_rf, "text_emotion_rf.pkl"),
):
    joblib.dump(_pipe, _path)

# Step 10: Define prediction function to select best model dynamically
def predict_best_model(text):
    """Predict the emotion of *text* with the most accurate trained pipeline.

    The pipeline with the highest held-out accuracy is chosen among the
    module-level ``pipe_lr``, ``pipe_svm`` and ``pipe_rf`` (ties go to the
    first one listed). Before prediction the input is passed through the
    same cleaning steps that were applied to the training column
    (user-handle removal, then stopword removal), so the vectorizer sees
    text in the same form it was fitted on.

    Args:
        text: A single raw string to classify.

    Returns:
        The label returned by the selected pipeline's ``predict`` for
        this one input.
    """
    # name -> (fitted pipeline, accuracy on the test split)
    models = {
        "Logistic Regression": (pipe_lr, lr_acc),
        "SVM": (pipe_svm, svm_acc),
        "Random Forest": (pipe_rf, rf_acc)
    }
    # Select model with highest accuracy
    best_model_name = max(models, key=lambda k: models[k][1])
    best_model, best_acc = models[best_model_name]

    # Mirror the training-time preprocessing; feeding raw text would give
    # the model tokens that were deliberately stripped during training.
    clean_text = nfx.remove_stopwords(nfx.remove_userhandles(text))

    prediction = best_model.predict([clean_text])[0]
    print(f"Best Model: {best_model_name} (Accuracy: {best_acc})")
    print(f"Prediction: {prediction}")
    return prediction

# Test the function: smoke-check the selection + prediction path on one
# sample sentence. The call is left as a bare expression (not assigned).
predict_best_model("I am feeling very happy today!")
