In [2]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import gensim
from gensim.utils import simple_preprocess
from spellchecker import SpellChecker
from random import randint
import numpy as np
import string
import json
import os
import random
import sys
sys.path.append("../libraries/")
from selector import split_data

In [17]:
# Read the NRC lexicon into a dictionary
def readNRC(filename, emotions=('anger', 'anticipation', 'disgust', 'fear', 'joy',
                                'negative', 'positive', 'sadness', 'surprise', 'trust')):
    """Read the NRC word-level emotion lexicon into a dictionary keyed by stem.

    Every usable line of the file has three tab-separated fields,
    ``word<TAB>emotion<TAB>flag``, where ``flag`` is an integer. Lines with a
    different number of fields (blank lines, headers) are skipped, and so are
    lines whose emotion is not listed in ``emotions``.

    Words are Porter-stemmed before being stored. Several word forms can share
    one stem; their flags are merged with an element-wise maximum, so a stem
    carries an emotion as soon as any of its word forms does. (Previously the
    last word form read silently replaced the earlier ones.)

    Args:
        filename: Path to the lexicon text file.
        emotions: Ordered emotion names. Position ``i`` of every returned
            vector holds the flag for ``emotions[i]``. The default is the
            alphabetical order of the ten NRC categories.

    Returns:
        dict mapping each stem to an integer ``np.ndarray`` of length
        ``len(emotions)``.

    Raises:
        ValueError: If the third field of a usable line is not an integer.
    """
    # Index by emotion name instead of counting lines, so a missing or extra
    # line for one word cannot shift the columns of every following word.
    column = {name: idx for idx, name in enumerate(emotions)}
    wordToEmotions = dict()
    p_stemmer = PorterStemmer()
    with open(filename, 'r') as fp:
        # Iterate lazily; there is no need to hold the whole file in memory.
        for line in fp:
            words = line.rstrip('\n').split('\t')
            if len(words) != 3:
                continue
            idx = column.get(words[1])
            if idx is None:
                continue
            # Stem word
            stem = p_stemmer.stem(words[0])
            vector = wordToEmotions.get(stem)
            if vector is None:
                vector = np.zeros(len(emotions), dtype=int)
                wordToEmotions[stem] = vector
            # Union across all word forms that collapse onto this stem.
            vector[idx] = max(vector[idx], int(words[2]))
    return wordToEmotions

# Load the lexicon once for the whole notebook: a dict mapping each Porter stem
# to a NumPy array of integer emotion flags.
lexicon = readNRC("../data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
# Human-readable names for the positions of every lexicon vector.
# NOTE(review): assumed to match the per-word emotion order of the lexicon file - confirm.
emotionList = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust']

In [10]:
# Test code: look up the vector stored under the stem of 'amazement' and pair
# each value with its emotion name. As the last expression of the cell, the
# resulting list is what the notebook displays.
p_stemmer = PorterStemmer()
list(zip(emotionList, lexicon[p_stemmer.stem('amazement')]))

[('anger', 0),
 ('anticipation', 0),
 ('disgust', 0),
 ('fear', 0),
 ('joy', 0),
 ('negative', 0),
 ('positive', 0),
 ('sadness', 0),
 ('surprise', 0),
 ('trust', 0)]

In [4]:
# Read in the data.
# NOTE(review): split_data is imported from ../libraries/selector and is not
# shown here; presumably 80 is the percentage of reviews placed in json_dat,
# with the remainder in val_dat - confirm against selector.split_data.
# val_dat is not used by any cell visible in this notebook.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

In [18]:
# Module-level Porter stemmer; getEmotions reads this global to stem each token.
p_stemmer = PorterStemmer()
# NOTE(review): sp is not referenced by any cell visible in this notebook -
# confirm whether the spell checker is still needed.
sp = SpellChecker()

def getEmotions(text, lexicon):
    """Average the lexicon vectors of the words in ``text``.

    The text is tokenised with ``simple_preprocess(text, deacc=True)`` and each
    token is stemmed with the module-level ``p_stemmer``. Stems that are not
    in ``lexicon`` are ignored. The vectors of the remaining stems are summed
    and divided by the number of matched stems, so the average is taken over
    lexicon words only, not over every word of the text.

    Args:
        text: Review text to analyse.
        lexicon: Mapping from stem to a 10-element emotion vector
            [anger, anticipation, ..., trust].

    Returns:
        A 10-element NumPy array. It is an integer array of zeros when no
        stem of ``text`` is found in ``lexicon``.
    """
    stems = (p_stemmer.stem(token) for token in simple_preprocess(text, deacc=True))
    matched = [lexicon[stem] for stem in stems if stem in lexicon]

    totals = np.zeros(10, dtype=int)
    for vector in matched:
        totals = totals + vector

    # Nothing matched: hand back the untouched zero vector.
    if not matched:
        return totals
    return totals / len(matched)

In [39]:
# Get the sample data: draw `num` reviews uniformly at random, with replacement.
num = 10000
# random.choices only ever picks valid positions. The previous
# randint(0, len(json_dat)) is inclusive at both ends, so it could return
# len(json_dat) itself and raise IndexError on the lookup.
sampleData = random.choices(json_dat, k=num)

# Create a vector for each data point
scores = []
emotions = []
for i, currJson in enumerate(sampleData):
    # '\r' rewinds to the start of the line so the percentage updates in place.
    print('[%d%%]\r' % (100*(i+1)/len(sampleData)), end='')
    reviewText = currJson.get('reviewText')
    # Skip reviews with missing or empty text; scores and emotions stay aligned
    # because both are appended only for reviews that pass this check.
    if not reviewText:
        continue
    # Get score
    scores.append(currJson.get('overall'))
    # Get review words and do analysis
    emotions.append(getEmotions(reviewText, lexicon))

[100%]

In [43]:
from sklearn.model_selection import train_test_split

# Binary target: True when the review's 'overall' score is at least 4.5.
labels = np.array(scores) >= 4.5
# One row per kept review, one column per emotion.
np_emotions = np.array(emotions)
# Hold out 20% of the rows for testing.
# NOTE(review): no random_state is passed, so the split (and every metric
# computed from it) changes on each run.
X_train, X_test, y_train, y_test = train_test_split(np_emotions, labels, test_size=0.2)

In [47]:
from sklearn import svm
from sklearn import metrics

# Make the classifier: support-vector classifier with an RBF kernel
clf = svm.SVC(kernel='rbf')
# Disabled check (presumably a shuffled-label baseline - confirm before re-enabling):
# np.random.shuffle(y_train)
# Fit the model
clf.fit(X_train, y_train)

# Predict the testing set
y_pred = clf.predict(X_test)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Model Precision: what fraction of the samples predicted positive are actually positive?
print("Precision:",metrics.precision_score(y_test, y_pred))
# Model Recall: what fraction of the actual positive samples are predicted positive?
print("Recall:",metrics.recall_score(y_test, y_pred))
# Model F1-score: harmonic mean of precision and recall
print("F1-score:",metrics.f1_score(y_test, y_pred))

# Report: precision, recall, F1 and support for each class.
# NOTE(review): in the recorded output the False class has recall 0.02, i.e. the
# model predicts True for almost every review; read accuracy with that in mind.
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.677
Precision: 0.6786437246963563
Recall: 0.9918639053254438
F1-score: 0.8058894230769231
              precision    recall  f1-score   support

       False       0.54      0.02      0.04       648
        True       0.68      0.99      0.81      1352

    accuracy                           0.68      2000
   macro avg       0.61      0.51      0.42      2000
weighted avg       0.63      0.68      0.56      2000



In [36]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over the 8 closest training rows.
# NOTE(review): `metrics` is not imported in this cell; it relies on the
# `from sklearn import metrics` executed in the SVM cell.
neigh = KNeighborsClassifier(n_neighbors=8)
neigh.fit(X_train, y_train)

y_pred = neigh.predict(X_test)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Model Precision: what fraction of the samples predicted positive are actually positive?
print("Precision:",metrics.precision_score(y_test, y_pred))
# Model Recall: what fraction of the actual positive samples are predicted positive?
print("Recall:",metrics.recall_score(y_test, y_pred))
# Model F1-score: harmonic mean of precision and recall
print("F1-score:",metrics.f1_score(y_test, y_pred))

# Report: precision, recall, F1 and support for each class.
# NOTE(review): the recorded class supports (660/1340) differ from the SVM
# cell's (648/1352), so the two recorded outputs come from different splits
# and are not directly comparable.
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.604
Precision: 0.6718946047678795
Recall: 0.7992537313432836
F1-score: 0.7300613496932515
              precision    recall  f1-score   support

       False       0.34      0.21      0.26       660
        True       0.67      0.80      0.73      1340

    accuracy                           0.60      2000
   macro avg       0.50      0.50      0.49      2000
weighted avg       0.56      0.60      0.57      2000

