# Download the dataset from the link below
https://www.kaggle.com/praveengovi/emotions-dataset-for-nlp

In [1]:
# Environment setup: imports, reproducibility seeds, and working directory.
import os

import numpy as np
import pandas as pd
from numpy.random import seed

# Seed NumPy's global RNG before any later cell draws random numbers.
seed(1)

# `tensorflow.set_random_seed` only exists in TensorFlow 1.x; TensorFlow 2.x
# exposes the same functionality as `tf.random.set_seed`. Bind whichever one is
# available to the original name so the cell runs on both major versions.
try:
    from tensorflow import set_random_seed
except ImportError:
    import tensorflow as tf
    set_random_seed = tf.random.set_seed
set_random_seed(2)

# The data files are read with relative paths, so the working directory must be
# the folder that contains them. Set the EMOTION_PREDICTION_DIR environment
# variable to point at your own copy instead of editing the default below.
WORK_DIR = os.environ.get('EMOTION_PREDICTION_DIR', 'C:\\Users\\Abin\\Desktop\\Emotion_Prediction')
os.chdir(WORK_DIR)

In [2]:
# Both files share one layout: no header row, ';' between the text and its
# label, UTF-8 encoded. Keep the parser options in one place so the two reads
# cannot drift apart.
read_options = dict(header=None, sep=';', names=['Input', 'Sentiment'], encoding='utf-8')

df_train = pd.read_csv('train.txt', **read_options)
df_test = pd.read_csv('test.txt', **read_options)

In [3]:
# Peek at the last five training rows to confirm the file parsed into two columns.
df_train.tail(n=5)

Unnamed: 0,Input,Sentiment
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger
15999,i know a lot but i feel so stupid because i ca...,sadness


                                                         Contents

01. Data Sourcing, Data Cleaning and Sampling
02. Bag of Words Model
03. Cost Sensitive Learning and Evaluation Metrics
04. Building Base-Line Model (Logistic Regression) and save the model

                                           01. Data Sourcing, Data Cleaning and Sampling

In [4]:
# Rows per emotion label; the counts are far from uniform (imbalanced classes).
df_train['Sentiment'].value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: Sentiment, dtype: int64

In [5]:
# Tag every row with the split it came from so both frames can be cleaned
# together and separated again afterwards.
for frame, split_name in ((df_train, 'train'), (df_test, 'test')):
    frame['format'] = split_name

# Row-wise stack; each frame keeps its own original index values.
total_df = pd.concat([df_train, df_test])
total_df.shape

(18000, 3)

                                                 01. Data Cleaning
                                                  
                                                  a. Tokenization (Word and Regex)
                                                  b. Convert Tokens to Lower Case
                                                  c. Filter Out Punctuation
                                                  d. Filter out Stop Words (and Pipeline)
                                                  e. Stem Words

                                            a. Tokenization (word_tokenize) 

In [6]:
import nltk
from nltk.tokenize import word_tokenize

# `word_tokenize` needs the Punkt tokenizer data. Older NLTK releases load it
# from the 'punkt' package, while recent releases look for 'punkt_tab' instead,
# so fetch both to keep the notebook runnable on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Abin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Work on a copy so `total_df` stays untouched by the cleaning steps.
df = total_df.copy()

# Split every sentence into a list of word tokens.
df['tokenized_Tweet'] = df['Input'].apply(nltk.word_tokenize)
df['tokenized_Tweet'].head()

0                         [i, didnt, feel, humiliated]
1    [i, can, go, from, feeling, so, hopeless, to, ...
2    [im, grabbing, a, minute, to, post, i, feel, g...
3    [i, am, ever, feeling, nostalgic, about, the, ...
4                            [i, am, feeling, grouchy]
Name: tokenized_Tweet, dtype: object

In [8]:
# Turn each token list into its string form so the regex tokenizer in the
# next step receives plain text.
token_lists = df['tokenized_Tweet']
df['tokenized_Tweet'] = token_lists.astype(str)

                                           a. Tokenization (RegexpTokenizer) 

A tokenizer that picks out sequences of alphanumeric characters as tokens and drops everything else.
(Example: in "azithromycin-induced" the hyphen is dropped.)

In [9]:
from nltk.tokenize import RegexpTokenizer

# \w+ keeps runs of word characters and discards everything between them
# (brackets, quotes, commas, hyphens, ...).
tokenizer = RegexpTokenizer(r'\w+')
df['tokenized_Tweet'] = df['tokenized_Tweet'].apply(tokenizer.tokenize)
df['tokenized_Tweet'].head()

0                         [i, didnt, feel, humiliated]
1    [i, can, go, from, feeling, so, hopeless, to, ...
2    [im, grabbing, a, minute, to, post, i, feel, g...
3    [i, am, ever, feeling, nostalgic, about, the, ...
4                            [i, am, feeling, grouchy]
Name: tokenized_Tweet, dtype: object

                                               b. Convert Tokens to Lower Case

In [10]:
# Normalise case so that e.g. "Happy" and "happy" count as the same token.
df['tokenized_Tweet'] = df['tokenized_Tweet'].apply(
    lambda tokens: list(map(str.lower, tokens))
)
df['tokenized_Tweet'].head()

0                         [i, didnt, feel, humiliated]
1    [i, can, go, from, feeling, so, hopeless, to, ...
2    [im, grabbing, a, minute, to, post, i, feel, g...
3    [i, am, ever, feeling, nostalgic, about, the, ...
4                            [i, am, feeling, grouchy]
Name: tokenized_Tweet, dtype: object

                                                 c. Filter Out Punctuation

In [11]:
# Keep only purely alphabetic tokens; anything containing digits or other
# non-letter characters is dropped.
df['token_filter'] = df['tokenized_Tweet'].apply(
    lambda tokens: list(filter(str.isalpha, tokens))
)
df['token_filter'].head()

0                         [i, didnt, feel, humiliated]
1    [i, can, go, from, feeling, so, hopeless, to, ...
2    [im, grabbing, a, minute, to, post, i, feel, g...
3    [i, am, ever, feeling, nostalgic, about, the, ...
4                            [i, am, feeling, grouchy]
Name: token_filter, dtype: object

                                             d. Filter out Stop Words (and Pipeline)

In [12]:
# Make NLTK's stop-word corpus available locally; the call is the last
# expression in the cell, so its return value is shown as the cell output.
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))


def drop_stop_words(tokens):
    """Return the tokens that are not English stop words, order preserved."""
    return [token for token in tokens if token not in stop_words]


df['token_filter_Stop'] = df['token_filter'].apply(drop_stop_words)
df['token_filter_Stop'].head()  # words such as "i", "the", "about" are gone from the sentences

0                            [didnt, feel, humiliated]
1    [go, feeling, hopeless, damned, hopeful, aroun...
2    [im, grabbing, minute, post, feel, greedy, wrong]
3    [ever, feeling, nostalgic, fireplace, know, st...
4                                   [feeling, grouchy]
Name: token_filter_Stop, dtype: object

                                                 e. Stem Words

In [14]:
# Single Porter stemmer instance, reused for every token in the stemming step.
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

In [15]:
# Reduce every remaining token to its Porter stem (e.g. "feeling" -> "feel").
df['token_filter_Stop_Stem'] = df['token_filter_Stop'].apply(
    lambda tokens: list(map(porter.stem, tokens))
)
df['token_filter_Stop_Stem'].head()

0                                [didnt, feel, humili]
1    [go, feel, hopeless, damn, hope, around, someo...
2         [im, grab, minut, post, feel, greedi, wrong]
3    [ever, feel, nostalg, fireplac, know, still, p...
4                                      [feel, grouchi]
Name: token_filter_Stop_Stem, dtype: object

                                              Sample to train and test set

In [16]:
# Split the cleaned frame back into the original train / test partitions.
data = df.copy()  # leave `df` with the full preprocessing history intact

is_train = data['format'] == 'train'
is_test = data['format'] == 'test'

# astype('str') turns each stemmed token list into one string, the text form
# later handed to the vectorizer.
train_x = data.loc[is_train, 'token_filter_Stop_Stem'].astype('str')
y_train = data.loc[is_train, 'Sentiment']

test_x = data.loc[is_test, 'token_filter_Stop_Stem'].astype('str')
y_test = data.loc[is_test, 'Sentiment']


In [17]:
# Integer codes 0-5 assigned in alphabetical order of the six emotion labels.
encoding = {
    label: code
    for code, label in enumerate(['anger', 'fear', 'joy', 'love', 'sadness', 'surprise'])
}

# Direct indexing (not .get) so an unexpected label raises KeyError instead of
# silently producing a missing target.
train_y = list(map(encoding.__getitem__, y_train))
test_y = list(map(encoding.__getitem__, y_test))

                                                  02. Bag of Words Model
                              represent a whole sequence of words as a single feature vector. 

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training text only, then encode both splits
# with that same vocabulary so their columns line up.
vectorizer = CountVectorizer().fit(train_x)
X_train, X_test = (vectorizer.transform(texts) for texts in (train_x, test_x))
X_train

<16000x10357 sparse matrix of type '<class 'numpy.int64'>'
	with 143599 stored elements in Compressed Sparse Row format>

In [19]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (installed as a scikit-learn
# dependency) and fall back to the legacy location only on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted vectorizer so future data can be transformed with the
# same vocabulary.
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

                                        03. Building Base-Line Model (Logistic Regression)

In [20]:
# Some required modules and libraries
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import auc, make_scorer, recall_score, f1_score, accuracy_score, precision_score, confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [21]:
from sklearn.linear_model import LogisticRegression

# Baseline model. class_weight='balanced' weights each class inversely to its
# frequency, which counteracts the label imbalance seen earlier.
classifier = LogisticRegression(class_weight='balanced').fit(X_train, train_y)

# Mean accuracy on the held-out test split.
score = classifier.score(X_test, test_y)
score

0.855

In [22]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (installed as a scikit-learn
# dependency) and fall back to the legacy location only on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the trained classifier for later reuse.
joblib.dump(classifier, 'Log_model.pkl')

['Log_model.pkl']