In [2]:
import pandas as pd 

In [3]:
# Load the emotion-labelled comments from the CSV next to the notebook and
# preview the first rows.
df=pd.read_csv('Emotion_classify_Data.csv')
df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(5937, 2)

In [5]:
# Count comments per emotion label to see how balanced the classes are.
df.Emotion.value_counts()

Emotion
anger    2000
joy      2000
fear     1937
Name: count, dtype: int64

In [6]:
# Encode the emotion labels as integer class ids for the classifiers.
emotion_to_class = {
    'anger': 0,
    'joy': 1,
    'fear': 2,
}
df['emotions_class'] = df['Emotion'].map(emotion_to_class)

# Series.map leaves NaN for any label that is missing from the dict. Fail
# loudly here instead of letting NaN targets reach the train/test split.
unmapped_labels = df.loc[df['emotions_class'].isna(), 'Emotion'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped emotion labels: {list(unmapped_labels)}")

In [7]:
# Check that the integer label column was added next to the original columns.
df.head()

Unnamed: 0,Comment,Emotion,emotions_class
0,i seriously hate one subject to death but now ...,fear,2
1,im so full of life i feel appalled,anger,0
2,i sit here to write i start to dig out my feel...,fear,2
3,ive been really angry with r and i feel like a...,joy,1
4,i feel suspicious if there is no one outside l...,fear,2


In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the raw comments for testing. Stratifying on the class id
# keeps the label proportions the same in both parts, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Comment'],
    df['emotions_class'],
    test_size=0.2,
    stratify=df['emotions_class'],
    random_state=2022,
)

In [9]:
# Preview the first training comments.
X_train.head()

5055    i feel like an innocent victim i feel that i j...
2662    i feel disgusted at him and at myself for havi...
1355    i still did not really feel like myself and i ...
4393    i think i wake up every day feeling terrified ...
3882    i thought this is precisely why i m making the...
Name: Comment, dtype: object

In [10]:
# Preview the first training labels.
y_train.head()

5055    1
2662    0
1355    0
4393    2
3882    2
Name: emotions_class, dtype: int64

In [11]:
# Number of training comments.
X_train.shape

(4749,)

In [12]:
# Number of training labels; compare with X_train.shape.
y_train.shape

(4749,)

In [13]:
# Number of held-out test comments.
X_test.shape

(1188,)

In [14]:
# Preview the first test comments.
X_test.head()

2743    i feel tortured and tragic enough as it is wit...
3003    i feel like i can take on the world and even i...
1617    i got my eyebrows waxed the other day and i fe...
4324    i think its kind of taken us this long to buil...
2488    i think people reject their feelings because t...
Name: Comment, dtype: object

In [15]:
# Number of test labels; compare with X_test.shape.
y_test.shape

(1188,)

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
# TF-IDF features (default settings) feeding a random forest.
clf = Pipeline([
    ('tf-idf vectorizer', TfidfVectorizer()),
    # Seed the forest so the reported scores are reproducible across re-runs.
    # Unseeded, every fit draws different bootstrap samples and feature subsets.
    ('random forest', RandomForestClassifier(random_state=2022)),
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.89      0.91       400
           1       0.92      0.93      0.92       400
           2       0.91      0.93      0.92       388

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



In [17]:
from sklearn.naive_bayes import MultinomialNB


#1. create a pipeline object
clf = Pipeline([
    # No ngram_range is passed, so TfidfVectorizer uses its default of unigrams
    # only. The step is named accordingly.
    ('vectorizer_unigrams', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)

#3. get the predictions for X_test and store them in y_pred
y_pred = clf.predict(X_test)

#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       400
           1       0.91      0.90      0.91       400
           2       0.91      0.87      0.89       388

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



Now we will preprocess the data to try to increase the accuracy.

In [18]:
import spacy

In [19]:
import spacy

# Load the small English spaCy pipeline once and reuse it for every comment.
# preprocess() only reads token.is_stop, token.is_punct and token.lemma_, none
# of which are produced by the dependency parser or the entity recognizer, so
# those two components are disabled to avoid running them on each comment.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


#use this utility function to get the preprocessed text data
def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is run through the module-level spaCy ``nlp`` pipeline. Tokens
    flagged as stop words or punctuation are skipped; the lemmas of the
    remaining tokens are joined with single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [20]:
# Clean every comment with preprocess() and keep the result in a new column so
# the raw text stays available for comparison.
df['processed_Comments'] = df['Comment'].map(preprocess)

In [21]:
# Compare the raw comments with their preprocessed versions.
df.head()

Unnamed: 0,Comment,Emotion,emotions_class,processed_Comments
0,i seriously hate one subject to death but now ...,fear,2,seriously hate subject death feel reluctant drop
1,im so full of life i feel appalled,anger,0,m life feel appalled
2,i sit here to write i start to dig out my feel...,fear,2,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,1,ve angry r feel like idiot trust place
4,i feel suspicious if there is no one outside l...,fear,2,feel suspicious outside like rapture happen


With preprocessing: retrain the models on the lemmatized comments with stop words removed.


In [22]:
from sklearn.model_selection import train_test_split
# Same 80/20 stratified split and seed as before, but on the preprocessed
# comments. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_Comments'],
    df['emotions_class'],
    test_size=0.2,
    stratify=df['emotions_class'],
    random_state=2022,
)

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
# TF-IDF features (default settings) feeding a random forest, trained on the
# current X_train / y_train.
clf = Pipeline([
    ('tf-idf vectorizer', TfidfVectorizer()),
    # Seed the forest so the reported scores are reproducible across re-runs.
    # Unseeded, every fit draws different bootstrap samples and feature subsets.
    ('random forest', RandomForestClassifier(random_state=2022)),
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       400
           1       0.93      0.94      0.94       400
           2       0.92      0.93      0.92       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



best accuracy

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw unigram + bigram counts feeding a random forest.
clf = Pipeline([
    # ngram_range=(1, 2) extracts both single words and adjacent word pairs.
    ('vectorizer_bi_grams', CountVectorizer(ngram_range=(1, 2))),
    # Seed the forest so the reported scores are reproducible across re-runs.
    # Unseeded, every fit draws different bootstrap samples and feature subsets.
    ('random_forest', RandomForestClassifier(random_state=2022)),
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       400
           1       0.94      0.95      0.95       400
           2       0.94      0.91      0.93       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [25]:
from sklearn.naive_bayes import MultinomialNB

# TF-IDF over unigrams and bigrams, classified with multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer_bigrams', TfidfVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.94      0.92       400
           1       0.96      0.94      0.95       400
           2       0.92      0.91      0.92       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



In [26]:
from sklearn.naive_bayes import MultinomialNB
# Raw unigram + bigram counts, classified with multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vectorizer_bigrams', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93       400
           1       0.95      0.93      0.94       400
           2       0.91      0.93      0.92       388

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188

