In [None]:
# --------------- Emotion Detection from Text ---------------
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import unique_labels

# ---------------- STEP 1: Load Dataset ----------------
# Each row is read as "<text>;<emotion>" (no header row in the file).
df = pd.read_csv("train.txt.csv", sep=';', header=None, names=['text', 'emotion'])

# The label field in this file is followed by a variable-length run of commas
# (e.g. "joy,,,,,"). Left in place, every distinct run length becomes its own
# class, which fragments the real emotions into hundreds of bogus labels.
# Strip the trailing commas and any surrounding whitespace so that each
# emotion maps to exactly one label.
df['emotion'] = df['emotion'].str.rstrip(',').str.strip()

print(f"✅ Loaded {len(df)} samples")

# ---------------- STEP 2: Preprocess Text ----------------
def preprocess(text):
    """Normalize a raw text sample before vectorization.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw sample. Non-string values (``None``, or the float NaN
            that pandas uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned string; ``''`` when there is nothing usable in ``text``.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on .lower() with AttributeError.
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Keep only letters and whitespace
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse and trim whitespace
    return text

df['clean_text'] = df['text'].map(preprocess)

# ---------------- STEP 3: Remove Rare Classes ----------------
# Keep only emotions that occur more than once; singleton classes are dropped.
# Each row is tagged with how often its emotion occurs, then the frame is
# masked on that count and re-indexed from 0.
emotion_counts = df['emotion'].map(df['emotion'].value_counts())
df = df[emotion_counts > 1].reset_index(drop=True)

# ---------------- STEP 4: Encode Emotions ----------------
# Cast labels to str, then learn the label <-> integer mapping and apply it.
df['emotion'] = df['emotion'].astype(str)
label_encoder = LabelEncoder().fit(df['emotion'])
df['emotion_encoded'] = label_encoder.transform(df['emotion'])

# Optional: show how many samples each remaining emotion has
print("\n📊 Class Distribution After Filtering:")
print(df['emotion'].value_counts())

# ---------------- STEP 5: Train-Test Split ----------------
# 80/20 split, stratified on the encoded label, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['emotion_encoded'],
    stratify=df['emotion_encoded'],
    test_size=0.2,
    random_state=42,
)

# ---------------- STEP 6: TF-IDF Vectorization ----------------
# The vocabulary (capped at 3000 features) is learned from the training
# texts only; the test texts are merely projected onto it.
vectorizer = TfidfVectorizer(max_features=3000).fit(X_train)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ---------------- STEP 7: Train the Model ----------------
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# ---------------- STEP 8: Predict ----------------
# Encoded emotion predictions for the held-out test texts.
y_pred = model.predict(X_test_vec)

# ---------------- STEP 9: Evaluate with Fixed Labels ----------------
# Compare how many distinct classes appear in the truth, in the predictions,
# and in the encoder; a large mismatch signals a labelling problem.
print("\n✅ Sanity Check:")
print("y_test classes:", np.unique(y_test).size)
print("y_pred classes:", np.unique(y_pred).size)
print("label_encoder.classes_:", label_encoder.classes_.size)

# Report only on labels that actually occur in y_test or y_pred so the
# label list and the display names always line up.
# unique_labels() already returns the labels in sorted order.
valid_labels = list(unique_labels(y_test, y_pred))
target_names = label_encoder.inverse_transform(valid_labels)

print("\n📊 Classification Report:")
report = classification_report(
    y_test,
    y_pred,
    labels=valid_labels,
    target_names=target_names,
)
print(report)

# ---------------- STEP 10: Confusion Matrix ----------------
def plot_confusion(y_true, y_pred, labels):
    """Draw the confusion matrix for the given labels as an annotated heatmap.

    Args:
        y_true: Encoded ground-truth labels.
        y_pred: Encoded predicted labels.
        labels: Encoded labels to include, in the order they should appear
            on both axes. They are decoded with the module-level
            ``label_encoder`` for the tick text.
    """
    counts = confusion_matrix(y_true, y_pred, labels=labels)
    tick_names = label_encoder.inverse_transform(labels)

    plt.figure(figsize=(10, 7))
    sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap='YlGnBu',
        xticklabels=tick_names,
        yticklabels=tick_names,
    )
    plt.title("Emotion Detection - Confusion Matrix")
    plt.xlabel("Predicted Emotion")
    plt.ylabel("Actual Emotion")
    plt.tight_layout()
    plt.show()


plot_confusion(y_test, y_pred, valid_labels)

# ---------------- STEP 11: Predict Custom Text ----------------
def predict_emotion(text):
    """Return the predicted emotion label for one raw text string.

    The text goes through the same ``preprocess`` step and fitted
    ``vectorizer`` as the training data, is classified by ``model``, and the
    encoded prediction is decoded back to its label with ``label_encoder``.
    """
    features = vectorizer.transform([preprocess(text)])
    encoded_prediction = model.predict(features)
    decoded = label_encoder.inverse_transform(encoded_prediction)
    return decoded[0]


# Example Prediction
test_text = "I feel so tired and upset today"
print(f"\n💬 Input: {test_text}")
print("🎯 Predicted Emotion:", predict_emotion(test_text))


✅ Loaded 16000 samples

📊 Class Distribution After Filtering:
emotion
joy,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,               242
joy,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,              225
joy,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,             217
joy,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,           214
sadness,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,     213
                                                                      ... 
fear,,,,,,,,,,,,,,,,,,                                                   2
surprise,,,,,,,,,,,,,,,,,,                                               2
love,,,,,,,,,,,,,,,,,,,,                                                 2
love,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,      2
sadness,,,,,,,,                                                          2
Name: count, Length: 328, dtype: int64

✅ Sanity Check:
y_test classes: 308
y_pred classes: 1
label_encoder.classes_: 328

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
