In [3]:
!pip install pandas numpy seaborn scikit-learn neattext joblib


Collecting pandas
  Using cached pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.2.5-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting neattext
  Using cached neattext-0.1.3-py3-none-any.whl.metadata (12 kB)
Collecting joblib
  Using cached joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)
  Using cached matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting threadpoolc

In [2]:
# TEXT EMOTION DETECTION with Multiple Model Comparison and Proper Model Saving/Loading

import os
import pandas as pd
import neattext.functions as nfx

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import joblib

MODEL_DIR = "models"

# One seed shared by the train/test split and every estimator that accepts
# `random_state`, so a fresh "Restart & Run All" reproduces the same numbers.
SEED = 42

# Ensure the models directory exists
os.makedirs(MODEL_DIR, exist_ok=True)

# Load dataset (update path if needed)
df = pd.read_csv("data/emotion_dataset_raw.csv")


# Clean the text data: strip @user handles first, then stop words.
# Anything passed to the fitted pipelines later should be cleaned the same way.
df['Clean_Text'] = df['Text'].apply(nfx.remove_userhandles).apply(nfx.remove_stopwords)

# Features and labels
X = df['Clean_Text']
y = df['Emotion']

# Train-test split (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Define the model pipelines: each one is a bag-of-words vectorizer followed
# by a classifier, so a pipeline can be fed (cleaned) strings directly.
pipelines = {
    "Logistic Regression": Pipeline([
        ('cv', CountVectorizer()),
        ('lr', LogisticRegression(max_iter=1000))
    ]),
    "SVM": Pipeline([
        ('cv', CountVectorizer()),
        # random_state only influences the probability estimates that
        # probability=True enables; seeded so predict_proba is reproducible.
        ('svc', SVC(probability=True, random_state=SEED))
    ]),
    "Random Forest": Pipeline([
        ('cv', CountVectorizer()),
        # Seeded: an unseeded forest gives a different accuracy on every run,
        # which can flip which model is reported as "best".
        ('rf', RandomForestClassifier(n_estimators=100, random_state=SEED))
    ])
}

# Train all models and evaluate accuracy.
# model_scores maps display name -> (fitted pipeline, accuracy on the test split).
model_scores = {}
for name, pipe in pipelines.items():
    pipe.fit(X_train, y_train)
    acc = pipe.score(X_test, y_test)
    model_scores[name] = (pipe, acc)
    print(f"{name} Accuracy: {acc:.4f}")

# Save the trained models as <MODEL_DIR>/pipe_<lowercased_name_with_underscores>.pkl
for name, (model, _) in model_scores.items():
    filename = os.path.join(MODEL_DIR, f"pipe_{name.lower().replace(' ', '_')}.pkl")
    joblib.dump(model, filename)

# Function to load models (optional)
def load_models(model_dir=MODEL_DIR):
    """Load every ``.pkl`` file found in ``model_dir`` with joblib.

    The dictionary key for each file is derived from its file name: a leading
    ``pipe_`` and the trailing ``.pkl`` are stripped, underscores become
    spaces and the result is title-cased (``pipe_random_forest.pkl`` ->
    ``"Random Forest"``).

    Note that title-casing does not restore acronyms: ``pipe_svm.pkl`` is
    returned under the key ``"Svm"``, not ``"SVM"``.

    Args:
        model_dir: Directory to scan for ``.pkl`` files.

    Returns:
        dict mapping the derived display name to the loaded object.

    Raises:
        FileNotFoundError: If ``model_dir`` does not exist (from ``os.listdir``).
    """
    prefix, suffix = "pipe_", ".pkl"
    loaded_models = {}
    # os.listdir returns entries in arbitrary order; sort for a stable result.
    for file_name in sorted(os.listdir(model_dir)):
        if not file_name.endswith(suffix):
            continue
        # Strip only the leading prefix / trailing suffix. str.replace would
        # also remove occurrences in the middle of the name.
        stem = file_name[:-len(suffix)]
        if stem.startswith(prefix):
            stem = stem[len(prefix):]
        model_name = stem.replace("_", " ").title()
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only point this at directories you trust.
        loaded_models[model_name] = joblib.load(os.path.join(model_dir, file_name))
    return loaded_models

# Load models back if needed
# loaded_models = load_models()

# Use the already trained models and accuracies for prediction
def predict_best_model(text):
    """Classify ``text`` with the most accurate of the trained pipelines.

    The pipeline with the highest test accuracy in the module-level
    ``model_scores`` dict is selected. ``text`` is cleaned in the same way as
    the training data (user handles removed, then stop words removed) before
    it is passed to the pipeline, so the model sees input in the form it was
    fitted on.

    Args:
        text: Raw input sentence.

    Returns:
        Tuple ``(best_model_name, prediction)``.

    Raises:
        ValueError: If ``model_scores`` is empty (no model has been trained).
    """
    if not model_scores:
        raise ValueError("No trained models available; run the training step first.")

    # Select the best model by accuracy
    best_model_name = max(model_scores, key=lambda k: model_scores[k][1])
    best_model, best_acc = model_scores[best_model_name]

    # Mirror the training-time preprocessing; feeding raw text would create a
    # mismatch between what the vectorizer was fitted on and what it receives.
    clean_text = nfx.remove_stopwords(nfx.remove_userhandles(text))

    prediction = best_model.predict([clean_text])[0]
    print(f"Best Model: {best_model_name} (Accuracy: {best_acc:.4f})")
    print(f"Prediction: {prediction}")
    return best_model_name, prediction

# Smoke-test the predictor on one sample sentence; the returned
# (model name, label) tuple is shown as the cell's output.
predict_best_model(text="I am feeling very happy today!")


Logistic Regression Accuracy: 0.6210
SVM Accuracy: 0.6154
Random Forest Accuracy: 0.5821
Best Model: Logistic Regression (Accuracy: 0.6210)
Prediction: joy


('Logistic Regression', 'joy')