In [30]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import matplotlib.pyplot as plt
import gensim
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd
import string
import json
import os
import random
import sys
sys.path.append("../libraries/")
from selector import split_data

In [50]:
def readNRC(filename):
    """Read the NRC word-level emotion lexicon into a dictionary.

    The file is read line by line. A line is ignored when its last
    character is '0' or when it does not split into exactly three
    tab-separated fields. For the remaining lines the first field (the
    word) is Porter-stemmed and the second field (the emotion) is recorded
    under that stem.

    Because different lexicon words can share a stem, the same emotion may
    be encountered several times for one key; it is stored only once.

    Args:
        filename: Path to the tab-separated lexicon file.

    Returns:
        dict mapping each stemmed word to a list of its emotion names, in
        first-seen order and without duplicates.
    """
    wordToEmotions = dict()
    p_stemmer = PorterStemmer()
    with open(filename, 'r') as fp:
        for line in fp:
            line = line.strip('\n')
            # A trailing '0' marks "no association"; nothing to store.
            if line[-1:] == '0':
                continue
            words = line.split('\t')
            # Skip blank or malformed lines.
            if len(words) != 3:
                continue
            # Key by stem so lookups on stemmed review tokens match.
            word = p_stemmer.stem(words[0])
            emotion = words[1]
            emotions = wordToEmotions.setdefault(word, [])
            # Several surface forms can collapse onto this stem; keep each
            # emotion once.
            if emotion not in emotions:
                emotions.append(emotion)
    return wordToEmotions

# Emotion categories, in the order used for every emotion vector below.
emotionList = [
    'anger',
    'anticipation',
    'disgust',
    'fear',
    'joy',
    'negative',
    'positive',
    'sadness',
    'surprise',
    'trust',
]
# Stemmed word -> emotions lookup built from the NRC lexicon file.
lexicon = readNRC("../data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")

In [3]:
# Load the review file and split it in two using the project-local helper.
# NOTE(review): presumably 80 is the percentage kept in json_dat, with the
# remainder going to val_dat — confirm against selector.split_data.
json_dat, val_dat = split_data('../data/Sports_and_Outdoors_Reviews_training.json', 80)

In [28]:
# Keep at most the first 40000 records for the emotion-scoring pass below
# (presumably to bound runtime — adjust the cap as needed).
sampleData = json_dat[:40000]

In [74]:
def hasEmotion(word, lexicon, emotion):
    """Return 1 if `lexicon` lists `emotion` for `word`, otherwise 0."""
    if word in lexicon and emotion in lexicon[word]:
        return 1
    return 0

def getEmotions(words, lexicon, emotionList):
    """Compute, for each emotion, the fraction of words that convey it.

    Every word is Porter-stemmed and then checked against `lexicon` for
    each entry of `emotionList`.

    Args:
        words: Sequence of tokens to score.
        lexicon: Mapping from stemmed word to its emotion names.
        emotionList: Emotion names; their order fixes the order of the
            returned values.

    Returns:
        A list with one value per entry of `emotionList`: the number of
        words associated with that emotion divided by len(words), i.e. a
        fraction in [0, 1] (not a percentage). When `words` is empty the
        counts are returned undivided, so the result is a list of integer
        zeros.
    """
    emotionCount = [0] * len(emotionList)
    p_stemmer = PorterStemmer()
    for word in words:
        # Lexicon keys are stems, so stem before looking up.
        stem = p_stemmer.stem(word)
        for i, emotion in enumerate(emotionList):
            emotionCount[i] += hasEmotion(stem, lexicon, emotion)

    # Normalise by the total word count; skipped for empty input to avoid
    # dividing by zero.
    if len(words) > 0:
        emotionCount = [count / len(words) for count in emotionCount]

    return emotionCount

# Sanity check: print the emotion vector of a single sample review.
string1 = sampleData[7].get('reviewText')
print(
    getEmotions(
        simple_preprocess(string1, deacc=True),
        lexicon,
        emotionList,
    )
)

[0.0, 0.14285714285714285, 0.0, 0.0, 0.2857142857142857, 0.0, 0.2857142857142857, 0.0, 0.14285714285714285, 0.14285714285714285]


In [79]:
# Build parallel lists: the star rating and the emotion vector of every
# sampled review that has non-empty text.
scores = []
emotions = []
for i, currJson in enumerate(sampleData):
    reviewText = currJson.get('reviewText')
    # Skip reviews with missing or empty text; they have nothing to score.
    if not reviewText:
        continue
    scores.append(currJson.get('overall'))
    reviewWords = gensim.utils.simple_preprocess(reviewText, deacc=True)
    currEmotions = getEmotions(reviewWords, lexicon, emotionList)
    emotions.append(currEmotions)
    # Overwrite a single progress line ('\r' returns to the line start).
    print('[%d%%]\r' % (100*i/len(sampleData)), end='')


[99%]