In [1]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import gensim
from gensim.utils import simple_preprocess
from spellchecker import SpellChecker
from random import randint
import numpy as np
import string
import json
import os
import random
import sys
sys.path.append("../libraries/")
from selector import split_data

In [2]:
def readNRC(filename):
    """ Reads the NRC lexicon into a dictionary.
    """
    wordToEmotions = dict()
    p_stemmer = PorterStemmer()
    count = 0
    with open(filename, 'r') as fp:
        # Loop through lines
        for line in fp.readlines():
            line = line.strip('\n')
            words = line.split('\t')
            if len(words) != 3:
                continue
            # Stem word
            word = p_stemmer.stem(words[0])
            val = int(line[-1:])
            # Store the emotions associated with the word
            if count == 0:
                wordToEmotions[word] = np.array([val])
            else:
                wordToEmotions[word] = np.append(wordToEmotions[word],val)
                
            count = (count + 1)%10
    return wordToEmotions

lexicon = readNRC("../data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
emotionList = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust']

In [3]:
print(lexicon['angel'])

[0 0 0 0 1 0 1 0 0 1]


In [4]:
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

In [19]:
sampleData = json_dat[:10000]

In [40]:
p_stemmer = PorterStemmer()
sp = SpellChecker()

# Returns a list with percentage of words which conveyed [anger, anticipation, ... , trust]
def getEmotions(words, lexicon):
    emotionCount = np.array([0,0,0,0,0,0,0,0,0,0])
    emotionWords = 0
    for word in words:
#         print(f'{word}\r', end='')
        # Stem each word
        word = p_stemmer.stem(sp.correction(word))
        # Sum the emotions
        if word in lexicon.keys():
            emotionWords += 1
            emotionCount = emotionCount + lexicon[word]
        
    emotionCount[emotionCount == 0] = 0.1
    # Avg over all words
    if emotionWords > 0:
        emotionCount = emotionCount / emotionWords
    
    return emotionCount

string1 = sampleData[45].get('summary')
print(getEmotions(simple_preprocess(string1, deacc = True), lexicon))

[0. 0. 0. 0. 1. 0. 1. 0. 1. 1.]


In [41]:
import time
for i in range(30):
    ra = randint(0, len(sampleData))
    time0 = time.time()
    getEmotions(simple_preprocess(sampleData[ra].get('reviewText')), lexicon)
    time1 = time.time()
    print(f'Takes {time1-time0},',end='')

Takes 0.0020008087158203125,Takes 0.004990339279174805,Takes 0.007004261016845703,Takes 0.0009946823120117188,Takes 0.0,Takes 0.0010080337524414062,Takes 0.005991458892822266,Takes 0.0070040225982666016,Takes 0.0,Takes 0.006996870040893555,Takes 0.001999378204345703,Takes 0.001003265380859375,Takes 0.009999275207519531,Takes 0.0010046958923339844,Takes 0.01299905776977539,Takes 0.005001544952392578,Takes 0.0029990673065185547,Takes 0.002996683120727539,Takes 0.001004934310913086,Takes 0.0029990673065185547,Takes 0.000997781753540039,Takes 0.0020017623901367188,Takes 0.02099442481994629,Takes 0.0010027885437011719,Takes 0.004006624221801758,Takes 0.001993894577026367,Takes 0.010000944137573242,Takes 0.009007692337036133,Takes 1.3049616813659668,Takes 0.006002664566040039,

In [None]:
scores = []
emotions = []
for i in range(len(sampleData)):
    currJson = sampleData[i]
    if(not currJson.get('summary')):
        continue
    # Get score
    scores.append(currJson.get('overall'))
    # Get summary words and do analysis
    words = simple_preprocess(currJson.get('summary'), deacc=True)
    currEmotions = getEmotions(words, lexicon)
    emotions.append(currEmotions)
    
    print('[%d%%]\r' % (100*(i+1)/len(sampleData)), end='')


[18%]

In [30]:
train_lbls = np.array(scores) >= 4.5
np_emotions = np.array(emotions)

[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]
[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]
[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]
[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]
[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]
[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]
[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]
[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]
[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]
[array([0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ])]


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(np_emotions, train_lbls, test_size=0.2)

In [33]:
from sklearn import svm

# Make the classifier
clf = svm.SVC(kernel='linear')

# Fit the model
clf.fit(X_train, y_train)

# Predict the testing set
y_pred = clf.predict(X_test)

In [37]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

In [38]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.6639579947493437
Precision: 0.687624183934147
Recall: 0.9086646661665416
