In [1]:
import pandas as pd
# Read the combined emotion corpus from the working directory; the bare name on
# the last line renders the frame as the cell output.
df = pd.read_csv("combined_emotion.csv")
df

Unnamed: 0,sentence,emotion
0,i just feel really helpless and heavy hearted,fear
1,ive enjoyed being able to slouch about relax a...,sad
2,i gave up my internship with the dmrg and am f...,fear
3,i dont know i feel so lost,sad
4,i am a kindergarten teacher and i am thoroughl...,fear
...,...,...
422741,i begun to feel distressed for you,fear
422742,i left feeling annoyed and angry thinking that...,anger
422743,i were to ever get married i d have everything...,joy
422744,i feel reluctant in applying there because i w...,fear


In [2]:
# Rows per emotion label, to check class balance.
df["emotion"].value_counts()

emotion
joy        143067
sad        121187
anger       59317
fear        49649
love        34554
suprise     14972
Name: count, dtype: int64

In [3]:
# Remove every row whose label is 'suprise' (spelling as it appears in the data).
suprise_rows = df.index[df["emotion"].eq("suprise")]
df = df.drop(index=suprise_rows)

In [4]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the emotion names, then store each row's integer code.
label = LabelEncoder().fit(df["emotion"])
df["emotion_n"] = label.transform(df["emotion"])

In [5]:
# Remove the text label column; `emotion_n` now carries the encoded value.
# Reassigning (instead of inplace=True) and ignoring an already-missing column
# keeps this cell safe to re-run without raising KeyError.
df = df.drop(columns=["emotion"], errors="ignore")

In [6]:
# Display the frame after the text label was replaced by its integer code.
df

Unnamed: 0,sentence,emotion_n
0,i just feel really helpless and heavy hearted,1
1,ive enjoyed being able to slouch about relax a...,4
2,i gave up my internship with the dmrg and am f...,1
3,i dont know i feel so lost,4
4,i am a kindergarten teacher and i am thoroughl...,1
...,...,...
422741,i begun to feel distressed for you,1
422742,i left feeling annoyed and angry thinking that...,0
422743,i were to ever get married i d have everything...,2
422744,i feel reluctant in applying there because i w...,1


In [7]:
# One sub-frame per encoded emotion label (codes 0 through 4), unpacked in
# code order so each name matches its label.
df_class_0, df_class_1, df_class_2, df_class_3, df_class_4 = (
    df[df["emotion_n"] == code] for code in range(5)
)

In [8]:
# Rows per encoded label; the smallest count sets the undersampling target.
df["emotion_n"].value_counts()

emotion_n
2    143067
4    121187
0     59317
1     49649
3     34554
Name: count, dtype: int64

In [9]:
# Undersample the four larger classes down to the size of class 3, which is the
# class concatenated unsampled when the balanced frame is built. Deriving the
# target from the data avoids a stale hard-coded row count, and a fixed seed
# makes the random draw repeatable across kernel restarts.
minority_size = len(df_class_3)
df_class_0_under = df_class_0.sample(n=minority_size, random_state=42)
df_class_1_under = df_class_1.sample(n=minority_size, random_state=42)
df_class_2_under = df_class_2.sample(n=minority_size, random_state=42)
df_class_4_under = df_class_4.sample(n=minority_size, random_state=42)

In [10]:
# Stack the untouched class 3 rows with the four undersampled frames, then
# show the per-label row counts of the combined frame.
balanced_parts = [
    df_class_3,
    df_class_0_under,
    df_class_1_under,
    df_class_2_under,
    df_class_4_under,
]
df_test_under = pd.concat(balanced_parts, axis=0)
df_test_under["emotion_n"].value_counts()

emotion_n
3    34554
0    34554
1    34554
2    34554
4    34554
Name: count, dtype: int64

In [11]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# Load the small English pipeline without the dependency parser and the named
# entity recognizer. The preprocessing below only reads lemma_, is_stop and
# is_punct, so those two components are pure overhead on a corpus this large.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
def preprocess(text):
    """Return the space-joined lemmas of `text`, skipping stop words and punctuation.

    The text is run through the module-level `nlp` pipeline; every token that is
    neither a stop word nor punctuation contributes its lemma, in original order.
    """
    kept_lemmas = (
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    )
    return " ".join(kept_lemmas)
# Clean every sentence and keep the result in a new column next to the original.
df["No_stop_word"] = df["sentence"].map(preprocess)

In [12]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split. The fixed seed keeps the partition, and therefore the
# reported metrics, reproducible across kernel restarts.
# NOTE(review): this splits the full, imbalanced `df`; the balanced
# `df_test_under` frame built earlier is not used here. Confirm that is intended.
x_train, x_test, y_train, y_test = train_test_split(
    df.No_stop_word,
    df.emotion_n,
    test_size=0.2,
    stratify=df.emotion_n,
    random_state=42,
)

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [15]:
# TF-IDF features feeding a 200-tree random forest (seeded with 42).
tfidf_step = ("vectorizer_tfidf", TfidfVectorizer())
forest_step = ("classifier", RandomForestClassifier(n_estimators=200, random_state=42))
emotion_analysis = Pipeline(steps=[tfidf_step, forest_step])

In [16]:
# Fit the vectorizer and the classifier on the training split.
emotion_analysis.fit(X=x_train, y=y_train)

In [17]:
# Predict on the held-out split and print per-class precision, recall and F1.
y_pred = emotion_analysis.predict(x_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.87      0.87     11863
           1       0.87      0.87      0.87      9930
           2       0.89      0.90      0.89     28614
           3       0.69      0.63      0.66      6911
           4       0.91      0.91      0.91     24237

    accuracy                           0.88     81555
   macro avg       0.85      0.84      0.84     81555
weighted avg       0.87      0.88      0.87     81555



In [18]:
# Overall score of the fitted pipeline on the held-out split.
emotion_analysis.score(X=x_test, y=y_test)

0.8750781681074121

In [20]:
import pickle
# Serialize the fitted pipeline to a pickle file in the working directory.
with open("emotion_analysis.pickle", mode="wb") as model_file:
    pickle.dump(emotion_analysis, model_file)