In [1]:
!pip install pandas numpy seaborn scikit-learn neattext joblib


In [1]:
# Train multiple emotion detection models and save them as .pkl files

import os
import pandas as pd
import neattext.functions as nfx
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import joblib

# Folder that receives the serialized pipelines. It is created up front
# (no error if it already exists) so the save step never hits a missing path.
MODEL_DIR = "models"
os.makedirs(name=MODEL_DIR, exist_ok=True)

# Load dataset (adjust path if needed)
df = pd.read_csv("data/emotion_dataset_raw.csv")

# Fail early, with a readable message, if the CSV lacks the columns used below.
missing_columns = {"Text", "Emotion"} - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# Drop rows that have no text or no label before cleaning. pandas represents
# empty CSV cells as float NaN; the neattext helpers are applied per cell and
# presumably expect str input (TODO confirm), and a NaN label cannot be used
# as a training target.
# Clean the text: remove @user handles first, then stopwords. Built with
# .assign so the cell yields the same frame every time it is re-run.
df = (
    df
    .dropna(subset=["Text", "Emotion"])
    .assign(
        Clean_Text=lambda frame: frame["Text"]
        .astype(str)
        .apply(nfx.remove_userhandles)
        .apply(nfx.remove_stopwords)
    )
)

# Features are the cleaned text; targets are the emotion labels.
X = df['Clean_Text']
y = df['Emotion']

# Split dataset: 70% train / 30% test, seeded so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed for the estimators that use randomness internally, so that repeated
# runs of the notebook produce the same fitted models and the same scores.
MODEL_SEED = 42

# Define pipelines for models.
# Every pipeline turns raw text into token counts with its own CountVectorizer
# and feeds them to one classifier. Keys are the display names used when
# reporting accuracy and when deriving the output file names.
pipelines = {
    "Naive Bayes": Pipeline([
        ('cv', CountVectorizer()),
        ('nb', MultinomialNB())
    ]),
    "Logistic Regression": Pipeline([
        ('cv', CountVectorizer()),
        # max_iter raised above the library default to give the solver more
        # iterations on the sparse count features.
        ('lr', LogisticRegression(max_iter=1000))
    ]),
    "SVM": Pipeline([
        ('cv', CountVectorizer()),
        # probability=True enables predict_proba; the internal calibration
        # shuffles data, so it is seeded as well.
        ('svc', SVC(probability=True, random_state=MODEL_SEED))
    ]),
    "Random Forest": Pipeline([
        ('cv', CountVectorizer()),
        # Bootstrapping and feature sub-sampling are random; without a seed
        # the forest (and its accuracy) changes on every run.
        ('rf', RandomForestClassifier(n_estimators=100, random_state=MODEL_SEED))
    ])
}

# Train and evaluate.
# Fit each pipeline on the training split, measure accuracy on the held-out
# split, and remember both the fitted pipeline and its score under its name.
model_scores = {}
for name in pipelines:
    pipeline = pipelines[name]
    # fit() returns the fitted pipeline itself, so scoring can be chained.
    acc = pipeline.fit(X_train, y_train).score(X_test, y_test)
    model_scores[name] = (pipeline, acc)
    print(f"{name} Accuracy: {acc:.4f}")

# Save models.
# Each fitted pipeline is written to MODEL_DIR as pipe_<name>.pkl, where <name>
# is the display name lower-cased with spaces turned into underscores.
for name, entry in model_scores.items():
    model = entry[0]  # entry is (fitted pipeline, accuracy); only the pipeline is stored
    slug = "_".join(name.lower().split(" "))
    filename = os.path.join(MODEL_DIR, f"pipe_{slug}.pkl")
    joblib.dump(model, filename)


Naive Bayes Accuracy: 0.5745
Logistic Regression Accuracy: 0.6210
SVM Accuracy: 0.6154
Random Forest Accuracy: 0.5883
