In [1]:
!pip install -q pandas scikit-learn matplotlib nltk

In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords

In [3]:
# Fetch the stop-word corpus first so this cell also works on a fresh
# kernel/machine: stopwords.words() raises LookupError when the corpus has
# never been downloaded.
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))

In [10]:
import pandas as pd
import os
import glob

In [11]:
# Recursively collect every CSV below the working directory. glob makes no
# ordering guarantee, so sort the result to keep the printed indices (and any
# later csv_files[i] lookup) stable across machines and runs.
csv_files = sorted(glob.glob("**/*.csv", recursive=True))

# Stop early with a clear message when there is nothing to load.
if not csv_files:
    raise FileNotFoundError(
        "No CSV file found in this folder. Please upload your dataset first."
    )

# List the candidates together with the index used to select one.
print("Found the CSV files:")
for i, f in enumerate(csv_files):
    print(f"[{i}] {f}")

✅ Found these CSV files:
[0] ML\emotion_dataset.csv
[1] ML\hearing_test.csv
[2] ML\tips.csv


In [13]:
# Use the first discovered CSV as the dataset. NOTE(review): this depends on the
# order of csv_files; change the index if a different file should be loaded.
DATA_PATH = csv_files[0]
print(f"\nLoading dataset from: {DATA_PATH}")


Loading dataset from: ML\emotion_dataset.csv


In [14]:
# Load the selected CSV into a DataFrame (UTF-8, parsed with pandas' "python" engine).
df = pd.read_csv(DATA_PATH, encoding="utf-8", engine="python")

In [16]:
# Sanity check of what was loaded: the column names and the first rows.
print("\nColumns detected:", df.columns.tolist())
print("\nFirst few rows:")
display(df.head())


Columns detected: ['Text', 'Emotion']

First few rows:


Unnamed: 0,Text,Emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


In [17]:
# Names of the text and label columns in df; None until a matching column is found.
text_col = None
label_col = None

In [18]:
# Detect which columns hold the text and the label by matching the header
# against a list of accepted names (case-insensitive, surrounding whitespace
# ignored). Reset first so re-running this cell never reuses a stale match.
text_col = None
label_col = None

for col in df.columns:
    # str() keeps non-string headers (e.g. integer column names) from crashing
    # on .lower(); strip() tolerates headers such as " Text".
    normalized = str(col).strip().lower()
    # Keep the FIRST matching column for each role so a later look-alike
    # column cannot silently override it.
    if text_col is None and normalized in ('text', 'sentence', 'content', 'message'):
        text_col = col
    elif label_col is None and normalized in ('label', 'emotion', 'sentiment', 'category'):
        label_col = col

# Compare against None explicitly: a valid but falsy header must not be
# mistaken for "not found".
if text_col is None or label_col is None:
    raise ValueError(
        f"Couldn't find text/label columns. Found columns: {df.columns.tolist()}\n"
        "Make sure your CSV has at least one text column and one label/emotion column."
    )


In [19]:
# Keep only the detected text/label columns, drop rows with a missing value in
# either, renumber the index, and standardise the column names to 'text'/'label'.
# NOTE: this overwrites df, so the cell is not re-runnable as-is: after the
# rename, the original names stored in text_col/label_col may no longer exist in df.
df = df[[text_col, label_col]].dropna().reset_index(drop=True)
df.columns = ['text', 'label'] 

In [20]:
# Report how many rows remain after dropping incomplete rows and preview them.
print(f"\nDataset loaded successfully with {len(df)} rows.")
display(df.head())


Dataset loaded successfully with 2000 rows.


Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


In [22]:
# This cell needs the cleaned-text column, which is produced by a separate
# preprocessing cell (df['text_clean'] = df['text'].apply(...)). Fail with an
# actionable message instead of a bare KeyError when that cell has not run yet.
if 'text_clean' not in df.columns:
    raise KeyError(
        "df has no 'text_clean' column yet; run the preprocessing cell that "
        "builds df['text_clean'] before creating the train/test split."
    )

X = df['text_clean']
y = df['label']

# Hold out 20% for testing, stratified on the label so both splits keep the
# same class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

In [23]:
# TF-IDF over unigrams and bigrams, capped at 10,000 features. The vocabulary
# is fitted on the training split only; the test split is merely transformed.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [24]:
# Baseline classifier: logistic regression on the TF-IDF features, with the
# iteration limit set to 1000.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

In [25]:
# Score the baseline on the held-out split.
y_pred = clf.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0: a class the model never predicts has an undefined precision;
# report it as 0.0 explicitly instead of emitting a warning for every
# affected metric.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.5925
              precision    recall  f1-score   support

       anger       1.00      0.15      0.25        55
        fear       0.58      0.16      0.25        45
         joy       0.55      0.96      0.70       139
        love       0.00      0.00      0.00        32
     sadness       0.64      0.77      0.70       116
    surprise       0.00      0.00      0.00        13

    accuracy                           0.59       400
   macro avg       0.46      0.34      0.32       400
weighted avg       0.58      0.59      0.51       400



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
def predict_emotion(text, preprocess=None):
    """Predict the emotion label for one piece of raw text.

    The text has to be cleaned the same way as the data the current
    ``vectorizer`` was fitted on. ``df['text_clean']`` is built with
    ``preprocess_text_safe`` (negations kept, no lemmatisation), so that is the
    default here; cleaning with ``preprocess_text`` instead would feed the
    model tokens in a different form than the ones it was trained on.

    Args:
        text: Raw input sentence.
        preprocess: Optional callable mapping raw text to cleaned text. Pass a
            different cleaner only when ``vectorizer``/``clf`` were fitted on
            text cleaned by that same callable. Defaults to
            ``preprocess_text_safe`` (resolved at call time).

    Returns:
        The predicted class label for ``text``.
    """
    if preprocess is None:
        # Looked up lazily so this function may be defined before the cleaner.
        preprocess = preprocess_text_safe
    clean = preprocess(text)
    # transform() expects an iterable of documents, hence the one-element list.
    vec = vectorizer.transform([clean])
    return clf.predict(vec)[0]

In [47]:
import pandas as pd

In [48]:
# Aliases used by the error analysis in the following cells; predictions are
# recomputed from the current clf on the test features.
X_test_texts = X_test
y_true = y_test
y_pred = clf.predict(X_test_tfidf)

In [49]:
# Positions (within the test split) of false positives for 'joy': the model
# said 'joy' although the true label is something else.
mis_idx = [
    position
    for position, pair in enumerate(zip(y_true, y_pred))
    if pair[1] == 'joy' and pair[0] != 'joy'
]
print("Number of problematic predictions (predicted joy but not true joy):", len(mis_idx))

Number of problematic predictions (predicted joy but not true joy): 107


In [50]:
# Print the first 20 false-'joy' examples: cleaned text, true label, prediction.
# X_test_texts / y_true are pandas objects, so they are indexed by position
# with .iloc; y_pred is indexed by position directly.
for i in mis_idx[:20]:
    print("---")
    print("text:", X_test_texts.iloc[i])
    print("true:", y_true.iloc[i])
    print("pred:", y_pred[i])

---
text: im sure theres nothing get person feeling amorous faster stay hotel
true: love
pred: joy
---
text: get people asking feels like hated man dallas county said assessor steve helm
true: sadness
pred: joy
---
text: needed space needed grow midst serious change ok yes also hurt feelings pretty badly bit spiteful
true: anger
pred: joy
---
text: watched news tv
true: anger
pred: joy
---
text: feel affectionate toward
true: love
pred: joy
---
text: feel like living austin really sweet ways
true: love
pred: joy
---
text: didint feel love caring
true: love
pred: joy
---
text: know feel guess one moments want feel like accepted even though whatever get mattered
true: love
pred: joy
---
text: made sex feel women enjoy sex body emotions admired respected
true: love
pred: joy
---
text: feel like wouldnt longing could baby new experience together
true: love
pred: joy
---
text: cannot explain need say please understand feeling heart im heartless person
true: anger
pred: joy
---
text: asthma 

In [51]:
import re
from nltk.corpus import stopwords
import nltk

In [52]:
# Make sure the NLTK stop-word corpus is available (quiet: no progress output).
nltk.download('stopwords', quiet=True)

True

In [53]:
# Negation words must survive stop-word removal, so they are taken out of the
# stop-word set.
NEGATIONS = {"not","no","never","n't"}
STOPWORDS = set(stopwords.words('english')) - NEGATIONS

In [54]:
# Lower-case contraction -> expanded form, used by expand_contractions().
# Expanding negated forms ("don't" -> "do not") turns the negation into a
# separate "not" token.
_contractions = {
    "i'm": "i am", "you're": "you are", "he's": "he is", "she's": "she is",
    "it's": "it is", "don't": "do not", "didn't": "did not", "can't": "cannot",
    "i've": "i have", "we're": "we are", "they're": "they are", "that's": "that is",
    "won't": "will not", "isn't":"is not", "aren't":"are not", "couldn't":"could not"
}

In [55]:
def expand_contractions(text):
    """Lower-case ``text`` and expand the contractions listed in ``_contractions``.

    Typographic apostrophes (’ and ‘) are normalised to a plain ``'`` first, so
    that text such as "don’t" is expanded as well; otherwise the negation would
    be lost once punctuation is stripped later on. Contractions are matched as
    whole words only.

    Args:
        text: Input string.

    Returns:
        The lower-cased string with every known contraction expanded.
    """
    t = text.lower()
    # The lookup table is keyed with the ASCII apostrophe only.
    for fancy_quote in ("\u2019", "\u2018"):
        t = t.replace(fancy_quote, "'")
    if not _contractions:
        return t
    # Longest keys first so a longer contraction is never pre-empted by a
    # shorter alternative in the regex alternation.
    keys = sorted(_contractions, key=len, reverse=True)
    pattern = r"\b(?:" + "|".join(re.escape(k) for k in keys) + r")\b"
    return re.sub(pattern, lambda match: _contractions[match.group(0)], t)

In [56]:
def preprocess_text_safe(s):
    """Clean one document while keeping negation words.

    Steps: coerce to ``str``, lower-case and expand contractions, blank out
    URLs and e-mail addresses, replace every character that is not a lower-case
    letter, digit or whitespace with a space, then drop stop words and
    one-character tokens. Tokens listed in ``NEGATIONS`` are always kept.

    Args:
        s: Raw text (any object; it is converted with ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    expanded = expand_contractions(str(s))
    without_links = re.sub(r'http\S+|www\.\S+|\S+@\S+', ' ', expanded)
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', without_links)

    kept = []
    for word in alnum_only.split():
        if word in NEGATIONS:
            # Negations bypass both the stop-word and the length filter.
            kept.append(word)
        elif len(word) > 1 and word not in STOPWORDS:
            kept.append(word)
    return " ".join(kept)

In [57]:
# Build the cleaned-text column used for training from the raw text,
# using the negation-preserving cleaner.
df['text_clean'] = df['text'].apply(preprocess_text_safe)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [59]:
# Second TF-IDF configuration: up to 15,000 uni-/bigram features, ignoring
# terms that occur in fewer than 3 documents (min_df=3).
vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1,2), min_df=3)
X = df['text_clean']
y = df['label']

In [60]:
from sklearn.model_selection import train_test_split
# Same 80/20 stratified split and seed as before, now on the re-cleaned text.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Fit the vocabulary on the training split only; the test split is just transformed.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [61]:
# class_weight='balanced' re-weights classes inversely to their frequency.
# saga shuffles the data while optimising, so fix random_state: without it the
# fitted coefficients (and therefore the reported metrics) can differ between runs.
clf = LogisticRegression(
    max_iter=2000,
    class_weight='balanced',
    solver='saga',  # saga handles larger feature sets
    random_state=42,
)
clf.fit(X_train_tfidf, y_train)



In [62]:
# Score the retrained model on the held-out split.
y_pred = clf.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.63
              precision    recall  f1-score   support

       anger       0.73      0.64      0.68        55
        fear       0.34      0.87      0.49        45
         joy       0.81      0.65      0.72       139
        love       0.48      0.44      0.46        32
     sadness       0.77      0.61      0.68       116
    surprise       0.67      0.15      0.25        13

    accuracy                           0.63       400
   macro avg       0.63      0.56      0.55       400
weighted avg       0.70      0.63      0.64       400



In [64]:
import numpy as np

def show_top_tokens_for_class(clf, vectorizer, class_name, top_n=20):
    """Print the features with the largest and smallest coefficients for a class.

    Args:
        clf: Fitted linear classifier exposing ``classes_`` and ``coef_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        class_name: Label of the class to inspect; must be in ``clf.classes_``.
        top_n: Number of features to show in each direction.

    Raises:
        ValueError: If the model has no ``classes_`` attribute or
            ``class_name`` is not one of its classes.
    """
    if not hasattr(clf, "classes_"):
        raise ValueError("Model has no classes_ attribute")
    classes = list(clf.classes_)
    if class_name not in classes:
        raise ValueError(
            f"Unknown class {class_name!r}; model classes are {classes}"
        )
    idx = classes.index(class_name)

    feature_names = np.array(vectorizer.get_feature_names_out())
    coef = np.asarray(clf.coef_)
    if coef.shape[0] == 1 and len(classes) == 2:
        # Binary models store a single coefficient row, which describes
        # classes_[1]; the weights for classes_[0] are its negation.
        coefs = coef[0] if idx == 1 else -coef[0]
    else:
        coefs = coef[idx]

    # Sort once (ascending). Slicing the reversed order, rather than using
    # [-top_n:], keeps top_n=0 from selecting every feature.
    order = np.argsort(coefs)
    top_pos = order[::-1][:top_n]
    top_neg = order[:top_n]
    print(f"Top tokens strongly favoring class '{class_name}':\n", feature_names[top_pos])
    print(f"\nTop tokens strongly disfavoring class '{class_name}':\n", feature_names[top_neg])

In [65]:
# Inspect the 25 highest- and lowest-weighted features for the 'joy' class.
show_top_tokens_for_class(clf, vectorizer, 'joy', top_n=25)

Top tokens strongly favoring class 'joy':
 ['sincere' 'energetic' 'acceptable' 'worthwhile' 'wonderful' 'talented'
 'glad' 'brave' 'perfect' 'privileged' 'every' 'positive' 'creative'
 'relaxed' 'determined' 'important' 'happy' 'enough' 'productive'
 'excited' 'valued' 'joyful' 'proud' 'thrilled' 'eager']

Top tokens strongly disfavoring class 'joy':
 ['curious' 'overwhelmed' 'strange' 'bit' 'sometimes' 'shocked' 'agitated'
 'uncomfortable' 'weird' 'little' 'tender' 'words' 'gentle' 'funny' 'bad'
 'surprised' 'anxious' 'tortured' 'angry' 'hated' 'afraid' 'amazed'
 'think' 'going' 'sympathetic']


In [66]:
# Spot check: rows labelled 'joy' whose cleaned text contains the whole word
# "annoying". Show how many there are and up to 20 of them.
mask = df['text_clean'].str.contains(r'\bannoying\b') & (df['label']=='joy')
print("Suspicious rows:", df[mask].shape[0])
display(df[mask].head(20))

Suspicious rows: 0


Unnamed: 0,text,label,text_clean


In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Location of each required resource inside nltk_data. Only the tokenizer
# models live under "tokenizers/"; stop words and WordNet data are corpora.
# Looking everything up under "tokenizers/" would never find the corpora and
# would trigger a download attempt on every run.
_NLTK_RESOURCES = {
    "punkt": "tokenizers/punkt",
    # NOTE(review): newer NLTK releases look up "punkt_tab" for word_tokenize;
    # a failed download is harmless because preprocess_text falls back to split().
    "punkt_tab": "tokenizers/punkt_tab",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
    "omw-1.4": "corpora/omw-1.4",
}

for res, res_path in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(res_path)
    except LookupError:
        try:
            nltk.download(res, quiet=True)
        except Exception as exc:
            # Best effort (e.g. no network): keep going, but say so instead of
            # hiding the failure. The code below has offline fallbacks.
            print(f"Warning: could not download NLTK resource '{res}': {exc}")


# Fall back to a small built-in stop-word list when the corpus is unavailable.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    stop_words = {"the","and","is","in","to","it","of","that","this","a","an","for","on","with","as","i","you"}

lemmatizer = WordNetLemmatizer()

def preprocess_text(s):
    """Clean and lemmatise one document.

    Lower-cases the input, blanks out URLs / e-mail addresses and every
    character that is not a lower-case letter, digit or whitespace, tokenises,
    removes stop words and one-character tokens, and lemmatises what is left.
    Missing NLTK data is tolerated: tokenising falls back to ``str.split`` and
    lemmatisation is skipped when the lookup raises ``LookupError``.

    Args:
        s: Raw text (any object; it is converted with ``str``).

    Returns:
        The processed tokens joined by single spaces.
    """
    cleaned = str(s).lower()
    # First pattern: links and e-mail addresses; second: any remaining symbol.
    for pattern in (r'http\S+|www\.\S+|\S+@\S+', r'[^a-z0-9\s]'):
        cleaned = re.sub(pattern, ' ', cleaned)

    try:
        words = nltk.word_tokenize(cleaned)
    except LookupError:
        # Tokenizer data not installed: plain whitespace split.
        words = cleaned.split()

    words = [w for w in words if len(w) > 1 and w not in stop_words]

    try:
        words = list(map(lemmatizer.lemmatize, words))
    except LookupError:
        # Lemmatizer data not installed: keep the tokens as they are.
        pass

    return " ".join(words)


In [None]:
# Smoke test: classify a few hand-written sentences with the current model.
print(predict_emotion("i start to feel emotional"))
print(predict_emotion("i feel blessed to know this family"))
print(predict_emotion("i feel pissed off and angry"))

sadness
love
anger
