In [1]:
import pandas as pd

In [2]:
# Load the labelled comments; the path is relative to the notebook's working directory.
df = pd.read_csv("Emotion_classify_Data.csv")

In [3]:
# Report (rows, columns) of the loaded frame.
df.shape

(5937, 2)

In [4]:
# Peek at the first rows to see the raw Comment text and its Emotion label.
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [5]:
# Count rows per emotion label to check how balanced the classes are.
df.Emotion.value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [6]:
# Encode each emotion label as an integer class id in a new `emotion_num` column.
df['emotion_num'] = df['Emotion'].map(
    {"anger": 0, "joy": 1, "fear": 2}
)

In [7]:
# Confirm the new emotion_num column sits alongside the matching Emotion label.
df.head()

Unnamed: 0,Comment,Emotion,emotion_num
0,i seriously hate one subject to death but now ...,fear,2
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,2
3,ive been really angry with r and i feel like a...,joy,1
4,i feel suspicious if there is no one outside l...,fear,2


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the raw comments for evaluation. Stratifying on the label
# keeps the class proportions the same in both partitions, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Comment'],
    df['emotion_num'],
    test_size=0.2,
    random_state=0,
    stratify=df['emotion_num'],
)

In [10]:
# Size of the training partition produced by the split above.
X_train.shape

(4749,)

In [13]:
# Size of the held-out test partition (test_size=0.2 of the rows).
X_test.shape

(1188,)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [15]:
# Experiment 1: trigram-only bag-of-words counts fed to a random forest.
# ngram_range=(3,3) keeps only three-word sequences as features.
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(3,3))),
    # Seed the forest so the printed report is reproducible on re-run,
    # consistent with the random_state=0 used for the train/test split.
    ('random_forest', RandomForestClassifier(random_state=0))
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision/recall/F1 on the held-out test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.34      0.42       400
           1       0.55      0.27      0.36       400
           2       0.39      0.76      0.52       388

    accuracy                           0.45      1188
   macro avg       0.50      0.46      0.43      1188
weighted avg       0.50      0.45      0.43      1188



In [18]:
# Experiment 2: unigram + bigram counts with a multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('multi_nb', MultinomialNB()),
])
# Pipeline.fit returns the fitted pipeline, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       400
           1       0.90      0.84      0.87       400
           2       0.85      0.83      0.84       388

    accuracy                           0.86      1188
   macro avg       0.86      0.86      0.86      1188
weighted avg       0.86      0.86      0.86      1188



In [19]:
# Experiment 3: unigram + bigram counts fed to a random forest.
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    # Seed the forest so the printed report is reproducible on re-run,
    # consistent with the random_state=0 used for the train/test split.
    ('random_forest', RandomForestClassifier(random_state=0))
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision/recall/F1 on the held-out test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       400
           1       0.86      0.96      0.91       400
           2       0.97      0.87      0.92       388

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [20]:
# Experiment 4: TF-IDF features (vectorizer defaults) fed to a random forest.
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    # Seed the forest so the printed report is reproducible on re-run,
    # consistent with the random_state=0 used for the train/test split.
    ('random_forest', RandomForestClassifier(random_state=0))
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision/recall/F1 on the held-out test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90       400
           1       0.89      0.95      0.92       400
           2       0.94      0.89      0.92       388

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [21]:
import spacy

In [22]:
# Load the spaCy pipeline named 'en_core_web_sm' once; `preprocess` reads this
# module-level `nlp` object on every call.
nlp = spacy.load('en_core_web_sm')

In [23]:
def preprocess(text):
    """Return `text` with stop words and punctuation dropped and the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded, and the lemmas of the
    remaining tokens are joined with single spaces.
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)

In [24]:
# Clean every comment with `preprocess` and keep the result in a new column.
# This invokes the spaCy pipeline once per row.
df['preprocessed_comment'] = df['Comment'].apply(preprocess)

In [25]:
# Re-split using the preprocessed text. The same test_size, random_state and
# stratification as the raw-text split are passed, and the earlier
# X_train/X_test/y_train/y_test variables are overwritten.
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_comment'],
    df['emotion_num'],
    test_size=0.2,
    random_state=0,
    stratify=df['emotion_num'],
)

In [26]:
# Experiment 5: unigram + bigram counts on the preprocessed text, random forest.
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    # Seed the forest so the printed report is reproducible on re-run,
    # consistent with the random_state=0 used for the train/test split.
    ('random_forest', RandomForestClassifier(random_state=0))
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision/recall/F1 on the held-out test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91       400
           1       0.94      0.95      0.94       400
           2       0.97      0.90      0.93       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [27]:
# Experiment 6: TF-IDF features on the preprocessed text, random forest.
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    # Seed the forest so the printed report is reproducible on re-run,
    # consistent with the random_state=0 used for the train/test split.
    ('random_forest', RandomForestClassifier(random_state=0))
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision/recall/F1 on the held-out test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93       400
           1       0.94      0.94      0.94       400
           2       0.95      0.94      0.94       388

    accuracy                           0.94      1188
   macro avg       0.94      0.94      0.94      1188
weighted avg       0.94      0.94      0.94      1188

