In [1]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import gensim
from gensim.utils import simple_preprocess
from spellchecker import SpellChecker
from random import randint
import numpy as np
import string
import json
import os
import random
import sys
sys.path.append("../libraries/")
from selector import split_data

In [2]:
def readNRC(filename):
    """ Reads the NRC lexicon into a dictionary.
    """
    wordToEmotions = dict()
    p_stemmer = PorterStemmer()
    count = 0
    with open(filename, 'r') as fp:
        # Loop through lines
        for line in fp.readlines():
            line = line.strip('\n')
            words = line.split('\t')
            if len(words) != 3:
                continue
            # Stem word
            word = p_stemmer.stem(words[0])
            val = int(line[-1:])
            # Store the emotions associated with the word
            if count == 0:
                wordToEmotions[word] = np.array([val])
            else:
                wordToEmotions[word] = np.append(wordToEmotions[word],val)
                
            count = (count + 1)%10
    return wordToEmotions

lexicon = readNRC("../data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
emotionList = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust']

In [45]:
list(zip(emotionList, lexicon['hate']))

[('anger', 1),
 ('anticipation', 0),
 ('disgust', 0),
 ('fear', 0),
 ('joy', 0),
 ('negative', 1),
 ('positive', 0),
 ('sadness', 0),
 ('surprise', 0),
 ('trust', 0)]

In [4]:
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

In [176]:
sampleData = json_dat[:4000]

In [91]:
bad = []
for i in sorted(range(len(sampleData)), reverse=True):
    if sampleData[i].get('summary').lower().find('star') > 0.0:
        sampleData.pop(i)

In [178]:
p_stemmer = PorterStemmer()
sp = SpellChecker()

# Returns a list with percentage of words which conveyed [anger, anticipation, ... , trust]
def getEmotions(words, lexicon):
    emotionCount = np.array([0,0,0,0,0,0,0,0,0,0])
    emotionWords = 0
    for word in words:
#         print(f'{word}\r', end='')
        # Stem each word
        word = p_stemmer.stem(sp.correction(word))
        # Sum the emotions
        if word in lexicon.keys():
            emotionWords += 1
            emotionCount = emotionCount + lexicon[word]
        
    emotionCount[emotionCount == 0] = 0.1
    # Avg over all words
    if emotionWords > 0:
        emotionCount = emotionCount / emotionWords
    
    return emotionCount

string1 = sampleData[45].get('reviewText')
print(getEmotions(simple_preprocess(string1, deacc = True), lexicon))

[0.  0.  0.  0.  0.  0.2 0.1 0.1 0.  0. ]


In [184]:
i = i+1
string1 = sampleData[i].get('reviewText')
em_vec = getEmotions(simple_preprocess(string1, deacc = True), lexicon)
print(string1)
list(zip(emotionList, em_vec))

I have both a Baltimore Ravens and now a Baltimore Orioles lanyard. I use mine for work to hold my ID card. While most just use the standard rope, I like to support my teams any way I can and look cool at it. The lanyard is a good size for an ID badge holder and very well made. The print on it is very sharp and clear. I would recommend this if you are a die hard Orioles fan that works in an office - it is just really cool to show off.


[('anger', 0.06896551724137931),
 ('anticipation', 0.0),
 ('disgust', 0.034482758620689655),
 ('fear', 0.06896551724137931),
 ('joy', 0.034482758620689655),
 ('negative', 0.10344827586206896),
 ('positive', 0.27586206896551724),
 ('sadness', 0.06896551724137931),
 ('surprise', 0.0),
 ('trust', 0.13793103448275862)]

In [21]:
import time
for i in range(30):
    ra = randint(0, len(sampleData))
    time0 = time.time()
    getEmotions(simple_preprocess(sampleData[ra].get('reviewText')), lexicon)
    time1 = time.time()
    print(f'Takes {time1-time0},',end='')

Takes 0.0033037662506103516,Takes 0.0007808208465576172,Takes 0.0003094673156738281,Takes 0.0062711238861083984,Takes 0.000583648681640625,Takes 0.00033926963806152344,Takes 0.0011785030364990234,Takes 0.0018792152404785156,Takes 0.0017588138580322266,Takes 0.4016759395599365,Takes 0.00020933151245117188,Takes 0.0007688999176025391,Takes 0.0011649131774902344,Takes 0.0007798671722412109,Takes 0.0001735687255859375,Takes 0.0012097358703613281,Takes 0.0009903907775878906,Takes 0.0014233589172363281,Takes 0.0008344650268554688,Takes 0.0012199878692626953,Takes 0.0008778572082519531,Takes 0.00012731552124023438,Takes 0.00012350082397460938,Takes 4.352596044540405,Takes 0.6008195877075195,Takes 0.0008301734924316406,Takes 0.00012969970703125,Takes 0.0012807846069335938,Takes 0.0010068416595458984,Takes 1.3344709873199463,

In [185]:
scores = []
emotions = []
for i in range(len(sampleData)):
    currJson = sampleData[i]
    if(not currJson.get('reviewText')):
        continue
    # Get score
    scores.append(currJson.get('overall'))
    # Get summary words and do analysis
    words, pos = simple_preprocess(currJson.get('reviewText'), deacc=True)
    currEmotions = getEmotions(words, lexicon)
    emotions.append(currEmotions)
    
    print('[%d%%]\r' % (100*(i+1)/len(sampleData)), end='')


[100%]

In [189]:
np.shape(scores)

(3999,)

In [190]:
train_lbls = np.array(scores) >= 4.5
np_emotions = np.array(emotions)

In [191]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC, SVC

clf = LinearSVC()
pred_score = cross_val_score(clf, np_emotions, train_lbls)

In [193]:
pred_score

array([0.6825   , 0.68875  , 0.685    , 0.68625  , 0.6795995])

In [188]:
list(zip(emotionList, [np.sum(np_emotions[:,i] > 0)/len(train_lbls) for i in range(0,10)]))
np.sum((np_emotions[:,6]>=0.4) == train_lbls)/len(train_lbls)

0.45586396599149787

In [33]:
from sklearn import svm

# Make the classifier
clf = svm.SVC(kernel='linear')

# Fit the model
clf.fit(X_train, y_train)

# Predict the testing set
y_pred = clf.predict(X_test)

In [37]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

In [38]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.6639579947493437
Precision: 0.687624183934147
Recall: 0.9086646661665416
