In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

os.chdir('/content/drive/MyDrive/Chatbot/classifier_model/')

### Importing the libraries

In [3]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv("emotion-emotion_69k - emotion-emotion_69k.csv")

In [5]:
#print first 5 rows
df.head(2)

Unnamed: 0.1,Unnamed: 0,Situation,emotion,empathetic_dialogues,labels,Unnamed: 5,Unnamed: 6
0,0,I remember going to the fireworks with my best...,sentimental,Customer :I remember going to see the firework...,"Was this a friend you were in love with, or ju...",,
1,1,I remember going to the fireworks with my best...,sentimental,Customer :This was a best friend. I miss her.\...,Where has she gone?,,


In [6]:
#print the shape of the data set
print(df.shape)

(64636, 7)


In [7]:
df = df[['Situation', 'emotion']]

In [8]:
df.isnull().sum()

Situation    0
emotion      5
dtype: int64

In [9]:
df = df.dropna()

In [10]:
df.isnull().sum()

Situation    0
emotion      0
dtype: int64

In [11]:
df.duplicated().sum()

45333

In [12]:
df = df.drop_duplicates()

In [13]:
#check if the data is balanced or not
df.emotion.value_counts()

emotion
surprised       1000
excited          741
angry            686
proud            669
sad              663
annoyed          662
grateful         638
lonely           633
afraid           626
impressed        617
terrified        617
hopeful          616
guilty           613
disgusted        613
anxious          612
confident        611
furious          599
anticipating     598
joyful           598
nostalgic        596
disappointed     595
prepared         589
jealous          579
content          570
embarrassed      558
devastated       557
sentimental      515
caring           503
trusting         499
ashamed          490
apprehensive     463
faithful         372
Name: count, dtype: int64

### Text Preprocessing

In [14]:
df.head()

Unnamed: 0,Situation,emotion
0,I remember going to the fireworks with my best...,sentimental
5,i used to scare for darkness,afraid
10,I showed a guy how to run a good bead in weldi...,proud
14,I have always been loyal to my wife.,faithful
17,A recent job interview that I had made me feel...,terrified


In [15]:
df.reset_index(drop=True, inplace = True)

In [16]:
df.shape

(19298, 2)

In [17]:
df.head()

Unnamed: 0,Situation,emotion
0,I remember going to the fireworks with my best...,sentimental
1,i used to scare for darkness,afraid
2,I showed a guy how to run a good bead in weldi...,proud
3,I have always been loyal to my wife.,faithful
4,A recent job interview that I had made me feel...,terrified


In [23]:
import gensim
import string
import spacy

In [24]:
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)

{'’re', "'s", 'keep', 'over', 'everything', 'besides', 'or', 'us', 'seem', 'himself', 'therein', 'eight', 'my', 'somewhere', 'former', 'whenever', 'last', 'whence', 'may', 'before', 'ca', 'doing', 'hereupon', 'becomes', 'how', 're', 'behind', 'can', 'meanwhile', 'name', 'neither', 'although', 'serious', 'anyone', 'anywhere', 'about', 'she', 'anything', 'with', 'latterly', 'front', 'wherever', 'their', 'every', 'do', 'other', 'many', 'thereupon', 'as', 'whereby', 'always', 'against', 'he', 'by', 'indeed', 'same', 'move', 'them', 'whatever', 'hereafter', 'amongst', 'get', 'seems', 'under', 'a', 'be', 'am', 'without', 'it', 'anyhow', 'thereby', 'when', 'go', 'three', 'cannot', 'each', 'mostly', 'on', 'twenty', 'does', 'they', 'will', '‘s', 'and', 'yourself', 'whose', 'thru', 'fifty', 'because', 'across', 'first', 'next', 'beside', 'amount', 'thence', 'herself', 'did', 'even', 'nor', 'n’t', 'should', 'why', 'since', 'now', 'various', 'upon', 'our', 'whither', 'namely', '‘ve', 'sometime', '

In [25]:
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [26]:
# Creating our tokenizer function

def spacy_tokenizer(sentence):

    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [27]:
import gensim.downloader as api
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [28]:
wv = api.load('word2vec-google-news-300')



In [29]:
wv.save('word2vec-google-news-300.kv')

In [30]:
def sent_vec(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in wv:
            ctr += 1
            wv_res += wv[w]
    wv_res = wv_res/ctr
    return wv_res

In [31]:
df['tokens'] = df['Situation'].apply(spacy_tokenizer)

In [32]:
df.head(2)

Unnamed: 0,Situation,emotion,tokens
0,I remember going to the fireworks with my best...,26,"[remember, firework, good, friend, lot, people..."
1,i used to scare for darkness,8,"[use, scare, darkness]"


In [33]:
df['vec'] = df['tokens'].apply(sent_vec)

In [None]:
df.head(2)

In [18]:
df['emotion'].unique()

array(['sentimental', 'afraid', 'proud', 'faithful', 'terrified',
       'joyful', 'angry', 'sad', 'jealous', 'grateful', 'prepared',
       'embarrassed', 'excited', 'annoyed', 'lonely', 'ashamed', 'guilty',
       'surprised', 'nostalgic', 'confident', 'furious', 'disappointed',
       'caring', 'trusting', 'disgusted', 'anticipating', 'anxious',
       'hopeful', 'content', 'impressed', 'apprehensive', 'devastated'],
      dtype=object)

In [19]:
df['emotion'].nunique()

32

In [20]:
emotion_replacements = {
    'surprised': 0,'excited': 1,'angry':2,'proud':3,'sad':4,'annoyed':5,'grateful':6,'lonely':7,'afraid':8,'hopeful':9,
    'terrified':10,'anxious':11,'guilty':12,'impressed':13,'disgusted':14,'confident':15,'anticipating':16,'joyful':17,
    'nostalgic':18,'furious':19,'disappointed':20,'prepared':21,'jealous':22,'content':23,'embarrassed':24,'devastated':25,
    'sentimental':26,'caring':27,'trusting':28,'ashamed':29,'apprehensive':30,'faithful':31}

# Replace emotions using the dictionary
df['emotion'] = df['emotion'].replace(emotion_replacements)

In [21]:
df['emotion'].unique()

array([26,  8,  3, 31, 10, 17,  2,  4, 22,  6, 21, 24,  1,  5,  7, 29, 12,
        0, 18, 15, 19, 20, 27, 28, 14, 16, 11,  9, 23, 13, 30, 25])

In [34]:
# Splitting the data into independent and dependent

x = df['vec'].to_list()
y = df['emotion'].to_list()

In [35]:
# Check the indices
print(df.index)

RangeIndex(start=0, stop=19298, step=1)


In [36]:
from imblearn.over_sampling import SMOTE,RandomOverSampler
resampled_X,resampled_Y=RandomOverSampler().fit_resample(x, y)

In [37]:
resampled_X

Output hidden; open in https://colab.research.google.com to view.

In [39]:
resampled_Y

[26,
 8,
 3,
 31,
 10,
 17,
 2,
 4,
 22,
 6,
 4,
 21,
 24,
 2,
 8,
 1,
 5,
 2,
 7,
 3,
 26,
 17,
 29,
 12,
 0,
 18,
 15,
 5,
 12,
 19,
 12,
 10,
 19,
 20,
 31,
 27,
 4,
 20,
 6,
 26,
 1,
 1,
 27,
 8,
 5,
 28,
 12,
 14,
 19,
 2,
 4,
 16,
 11,
 18,
 0,
 3,
 4,
 17,
 8,
 2,
 11,
 14,
 9,
 26,
 19,
 22,
 15,
 23,
 3,
 13,
 19,
 21,
 28,
 8,
 0,
 1,
 19,
 11,
 2,
 20,
 1,
 24,
 10,
 9,
 9,
 5,
 27,
 5,
 19,
 8,
 9,
 21,
 6,
 10,
 1,
 29,
 30,
 0,
 12,
 7,
 7,
 14,
 4,
 3,
 19,
 29,
 15,
 4,
 12,
 14,
 5,
 11,
 0,
 20,
 8,
 18,
 6,
 25,
 17,
 21,
 13,
 4,
 31,
 8,
 7,
 12,
 4,
 11,
 15,
 8,
 3,
 6,
 16,
 9,
 4,
 0,
 4,
 31,
 22,
 4,
 17,
 9,
 1,
 27,
 7,
 24,
 16,
 18,
 0,
 3,
 0,
 10,
 22,
 27,
 26,
 19,
 18,
 8,
 8,
 3,
 4,
 27,
 4,
 1,
 1,
 8,
 13,
 13,
 7,
 8,
 21,
 4,
 8,
 13,
 0,
 5,
 8,
 10,
 0,
 29,
 26,
 6,
 24,
 10,
 21,
 6,
 2,
 9,
 5,
 17,
 11,
 15,
 3,
 0,
 15,
 9,
 28,
 12,
 29,
 1,
 26,
 19,
 14,
 24,
 6,
 24,
 9,
 18,
 27,
 20,
 17,
 23,
 21,
 1,
 20,
 18,
 11,
 22,
 3,
 0,
 

In [40]:
# Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(resampled_X, resampled_Y, test_size=0.2, random_state=42)

In [43]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()

In [44]:
classifier.fit(X_train, y_train)

In [45]:
y_pred = classifier.predict(X_test)

In [46]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [47]:
Accuracy = accuracy_score(y_test, y_pred)
print(Accuracy)

0.7171875


In [48]:
Precision = precision_score(y_test, y_pred, average = 'weighted')
print(Precision)

0.7198883202541586


In [49]:
Recall = recall_score(y_test, y_pred, average = 'micro')
print(Recall)

0.7171875


In [50]:
F1 = f1_score(y_test, y_pred, average = 'macro')
print(F1)

0.7155128924232221
