In [1]:
# import libraries
import pandas as pd
import numpy as np
import json
import os
import seaborn as sns
import matplotlib.pyplot as plt



#For Preprocessing
import re   
import nltk 
# nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# read dataset into df folder
df = pd.read_csv("emotion_final.csv")

In [3]:
df.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21459 entries, 0 to 21458
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Text     21459 non-null  object
 1   Emotion  21459 non-null  object
dtypes: object(2)
memory usage: 335.4+ KB


In [5]:
df.shape

(21459, 2)

In [6]:
# process the tweets
# clearning the text
def clean_and_process(text):
    """
    This function removes any stopwords in the text
    It also removes punctuations
    and lastly it lemmatixe the word into its root form
    """
    
    # convert to lowercase
    text = text.lower()
    # remove non letters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    # tokenize the text into words
    words = text.split()
    # remove stopwords
    words = [w for w in words if w not in stopwords]
    # apply stemming
    #words = [PorterStemmer().stem(w) for w in words]
    words = [wordnet_lemmatizer.lemmatize(w) for w in words]
    # return a new sentense with the applied functions
    return " ".join(words)

In [7]:
# some global vars
# lematizer
wordnet_lemmatizer = WordNetLemmatizer()
# remove stopwords from the text column
stopwords = nltk.corpus.stopwords.words('english')

In [8]:
# check sample
print("Original text -->  ", df['Text'][18])
print("\nProcessed text -->  ", clean_and_process(df['Text'][18]))

Original text -->   i started feeling sentimental about dolls i had as a child and so began a collection of vintage barbie dolls from the sixties

Processed text -->   started feeling sentimental doll child began collection vintage barbie doll sixty


In [9]:
# apply the function to the create a new cleaned column
df['cleaned'] = df['Text'].apply(clean_and_process)

In [10]:
df.head()

Unnamed: 0,Text,Emotion,cleaned
0,i didnt feel humiliated,sadness,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,anger,feeling grouchy


## Preparation for Model Training.


- We need to create a text vectorizer.
- We must use the whole dataset to create the vectorizer for similarity purpose.
- After fitting the vectorizer , transfroming will be done as per datase

## We will use both TFIDF and COUNTVECTORIZER for vectorization


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# define vectorizers
# count vectorizer
countVect = CountVectorizer() 
# frequency vectorizer.
tfidfVect = TfidfVectorizer()

# fit count vectorizer
countVect.fit(df['cleaned'])
# fit tdifvector
tfidfVect.fit(df['cleaned'])

TfidfVectorizer()

In [12]:
# shape
df.shape


(21459, 3)

In [13]:
from sklearn.model_selection import train_test_split


In [14]:
# get the y/label/outcome column

y = df['Emotion']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned'], y, test_size=.1)

In [16]:
X_train.shape, y_train.shape , X_test.shape , y_test.shape

((19313,), (19313,), (2146,), (2146,))

In [17]:
# here we are using the object for countvectorizer and tdfif vectorizers fitted above to transform the text to vectors
# transform the training to vects
X_train_count = countVect.transform(X_train)
X_train_tdfif = tfidfVect.transform(X_train)


# transform the testing data
X_test_tdfif = tfidfVect.transform(X_test)
X_test_count = countVect.transform(X_test)

## OUR Sentiments are in textual/categorical form
- We need to turn them into numerical

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
lbl_object = LabelEncoder()

# encode for only validation and train datasets
lbl_object.fit(y)

# encode the labels
Ytrain = lbl_object.transform(y_train)
Ytest = lbl_object.transform(y_test)

## Modelling


In [29]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score , accuracy_score , precision_score , recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [21]:
def get_basic_scores(Ytrue , Ypred):
    print("Accuracy Score {:.4f}".format(accuracy_score(Ytrue , Ypred)))
    print("F1 Score {:.4f}".format(f1_score(Ytrue , Ypred , average='weighted')))
    print("Precision Score {:.4f}".format(precision_score(Ytrue , Ypred , average='weighted')))
    print("Recall Score {:.4f}".format(recall_score(Ytrue , Ypred , average='weighted')))

    
    # create a function to draw confusion matrix and showing the model statistics
    # confussion matrix plot
def show_model_stats(model_name , Model , x_test , y_test):
    # get prdictions
    print(f"\t\t********** {Model}  Evaluation **********\n")
    from sklearn.metrics import accuracy_score , confusion_matrix , classification_report , f1_score
    prediction = model_name.predict(x_test)
    print(f"Accuracy is   {accuracy_score(y_test , prediction)*100}%")
    print(f"F1 Score is   {f1_score(y_test , prediction , average='weighted')*100}%\n")
    # classifciation reports
    print(classification_report(y_test, prediction))
    print("\n")
    c_matrix = confusion_matrix(y_test, prediction)
     # plot confusion matrix for better view
    plt.figure(figsize=(5,5))
    sns.heatmap(c_matrix , annot= True ,fmt="" ,  annot_kws={"size": 10})
    plt.xlabel("Actual Label")
    plt.ylabel("Predicted Label")
    plt.title(f"Confusion Matrix Plot for {Model} classifier")

## 1. BaseLine Model Logisticregression


In [22]:
# define logisticregression models for both tdfif and count vectorizers
logit_tdif = LogisticRegression(n_jobs =-1)
logit_count = LogisticRegression(n_jobs =-1)


# train the models
# train and get score for each.
logit_count.fit(X_train_count , Ytrain)
logic_pred1 = logit_count.predict(X_test_count)
print("******COUNT Vectorizer on Logistic regression Statistics*********")
get_basic_scores(logic_pred1 , Ytest)

print("\n\n******TFIDF Vectorizer on Logistic regression Statistics*********")
logit_tdif.fit(X_train_tdfif , Ytrain)
logic_pred2 = logit_tdif.predict(X_test_tdfif)
get_basic_scores(logic_pred2 , Ytest)

******COUNT Vectorizer on Logistic regression Statistics*********
Accuracy Score 0.8826
F1 Score 0.8833
Precision Score 0.8852
Recall Score 0.8826


******TFIDF Vectorizer on Logistic regression Statistics*********
Accuracy Score 0.8611
F1 Score 0.8659
Precision Score 0.8789
Recall Score 0.8611


## 2. MultinomialNB (a naive bayes model)


In [23]:
# try out multinomial model
nb_count = MultinomialNB()
nb_tdif = MultinomialNB()
nb_count.fit(X_train_count , Ytrain)

nbpred1 = nb_count.predict(X_test_count)
print("******COUNT Vectorizer on MultinomialNB Statistics*********")
get_basic_scores(nbpred1 , Ytest)
print("\n\n******TFIDF Vectorizer on MultinomialNB Statistics*********")
nb_tdif.fit(X_train_tdfif , Ytrain)
nbpred2 = nb_tdif.predict(X_test_tdfif)
get_basic_scores(nbpred2 , Ytest)

******COUNT Vectorizer on MultinomialNB Statistics*********
Accuracy Score 0.7856
F1 Score 0.8038
Precision Score 0.8448
Recall Score 0.7856


******TFIDF Vectorizer on MultinomialNB Statistics*********
Accuracy Score 0.6789
F1 Score 0.7499
Precision Score 0.9030
Recall Score 0.6789


## 3.Decision tree model

In [24]:
dec_count = DecisionTreeClassifier()
dec_tdif = DecisionTreeClassifier()


dec_count.fit(X_train_count , Ytrain)
dec_pred1 = dec_count.predict(X_test_count)
print("******COUNT Vectorizer on Decision tree Classier Statistics*********")
get_basic_scores(dec_pred1 , Ytest)

print("\n\n******TFIDF Vectorizer on Decision tree Classifier Statistics*********")
dec_tdif.fit(X_train_tdfif , Ytrain)
dec_pred2 = dec_tdif.predict(X_test_tdfif)
get_basic_scores(dec_pred2 , Ytest)

******COUNT Vectorizer on Decision tree Classier Statistics*********
Accuracy Score 0.8709
F1 Score 0.8704
Precision Score 0.8713
Recall Score 0.8709


******TFIDF Vectorizer on Decision tree Classifier Statistics*********
Accuracy Score 0.8630
F1 Score 0.8624
Precision Score 0.8633
Recall Score 0.8630


## 4. Random Forest Classifier

In [25]:
random_tfidf = RandomForestClassifier(n_estimators=10)
random_count = RandomForestClassifier(n_estimators =10)

random_count.fit(X_train_count , Ytrain)
random_pred1 = random_count.predict(X_test_count)
print("******COUNT Vectorizer on Random forest Classier Statistics*********")
get_basic_scores(random_pred1 , Ytest)

print("\n\n******TFIDF Vectorizer on Random forest Statistics*********")
random_tfidf.fit(X_train_tdfif , Ytrain)
random_pred2 = random_tfidf.predict(X_test_tdfif)
get_basic_scores(random_pred2 , Ytest)


******COUNT Vectorizer on Random forest Classier Statistics*********
Accuracy Score 0.8621
F1 Score 0.8620
Precision Score 0.8641
Recall Score 0.8621


******TFIDF Vectorizer on Random forest Statistics*********
Accuracy Score 0.8565
F1 Score 0.8571
Precision Score 0.8598
Recall Score 0.8565


## 5. Support Vector Machine

In [38]:
svm_tdif = SVC()
svm_count = SVC()

svm_count.fit(X_train_count , Ytrain)
svm_pred1 = svm_count.predict(X_test_count)
print("******** Count Vectorizer on SVM***********")
get_basic_scores(svm_pred1 , Ytest)

svm_tdif.fit(X_train_tdfif , Ytrain)
svm_pred2 = svm_tdif.predict(X_test_tdfif)

print("******** TFIDF Vectorizer on SVM***********")
get_basic_scores(svm_pred2 , Ytest)


******** Count Vectorizer on SVM***********
Accuracy Score 0.8276
F1 Score 0.8339
Precision Score 0.8545
Recall Score 0.8276
******** TFIDF Vectorizer on SVM***********
Accuracy Score 0.8518
F1 Score 0.8564
Precision Score 0.8700
Recall Score 0.8518


In [47]:
# Adaboost classifier



ada_tfidf = AdaBoostClassifier()
ada_count = AdaBoostClassifier()

ada_count.fit(X_train_count , Ytrain)
ada_pred1 = ada_count.predict(X_test_count)
print("******COUNT Vectorizer on AdaBoost Classifier*********")
get_basic_scores(ada_pred1 , Ytest)

print("\n\n******TFIDF Vectorizer on AdaBoost Classifier*********")
ada_tfidf.fit(X_train_tdfif , Ytrain)
ada_pred2 = ada_tfidf.predict(X_test_tdfif)
get_basic_scores(ada_pred2 , Ytest)


******COUNT Vectorizer on AdaBoost Classifier*********
Accuracy Score 0.3551
F1 Score 0.4623
Precision Score 0.8190
Recall Score 0.3551


******TFIDF Vectorizer on AdaBoost Classifier*********
Accuracy Score 0.3523
F1 Score 0.4589
Precision Score 0.8141
Recall Score 0.3523
