In [1]:
import pandas as pd
import numpy as np

from textblob import TextBlob
import sys
sys.path.append('scripts/')
from utility import *
import spacy
nlp = spacy.load("en_core_web_sm") # English Model

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

%matplotlib inline

RANDOM_STATE = 42

import warnings
warnings.filterwarnings('ignore')

In [9]:
data = pd.read_csv('data/Skyrim_Reviews.csv', usecols=['review', 'rating'])
data.head()

Unnamed: 0,rating,review
0,Recommended,"This is the game the never, ever ends. I picke..."
1,Recommended,Ruined my life. Five stars.
2,Recommended,I was stacking books on a shelf in my house in...
3,Recommended,"Best game I ever bought. In this game, you'll ..."
4,Recommended,Playing Skyrim is like masturbating. Feels goo...


In [7]:
# Opinion words
neg_file = open("data/negative-words.txt")
pos_file = open("data/positive-words.txt")
neg = [line.strip() for line in neg_file.readlines()]
pos = [line.strip() for line in pos_file.readlines()]

opinion_words = pos + neg

In [10]:
def custom_sentencizer(doc):
    
    boundary = re.compile('\.{2,}')
    digits = re.compile('^[0-9]*$')
    
    for token in doc[:-1]:
        if boundary.match(token.text) or digits.match(token.text) or token.text in [',', '!', '?', ':', 'and']:
            doc[token.i+1].is_sent_start = False
                
    return doc

In [11]:
nlp.add_pipe(custom_sentencizer, before="parser")

In [31]:
def feature_sentiment(text):
    '''
    input: dictionary and sentence
    function: appends dictionary with new features if the feature did not exist previously,
              then updates sentiment to each of the new or existing features
    output: updated dictionary
    '''

    sent_dict = {}#Counter()
    for sentence in sentence_tokenize(text):
        sentence_tokens = nlp(sentence)
        debug = 0
        for token in sentence_tokens:
            print(token)
        #    print(token.text,token.dep_, token.head, token.head.dep_)
            # check if the word is an opinion word, then assign sentiment
            if token.text in opinion_words:
                sentiment = 1 if token.text in pos else -1
                # if target is an adverb modifier (i.e. pretty, highly, etc.)
                # but happens to be an opinion word, ignore and pass
                if (token.dep_ == "advmod"):
                    continue
                elif (token.dep_ == "amod"):
                    sent_dict[token.head.text] += sentiment
                # for opinion words that are adjectives, adverbs, verbs...
                else:
                    for child in token.children:
                        # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
                        # This could be better updated for modifiers that either positively or negatively emphasize
                        if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                            sentiment *= 1.5
                        # check for negation words and flip the sign of sentiment
                        if child.dep_ == "neg":
                            sentiment *= -1
                    for child in token.children:
                        # if verb, check if there's a direct object
                        if (token.pos_ == "VERB") & (child.dep_ == "dobj"):                        
                            sent_dict[child.text] += sentiment
                            # check for conjugates (a AND b), then add both to dictionary
                            subchildren = []
                            conj = 0
                            for subchild in child.children:
                                if subchild.text == "and":
                                    conj=1
                                if (conj == 1) and (subchild.text != "and"):
                                    subchildren.append(subchild.text)
                                    conj = 0
                            for subchild in subchildren:
                                sent_dict[subchild] += sentiment
    
                    # check for negation
                    for child in token.head.children:
                        noun = ""
                        if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                            sentiment *= 1.5
                        # check for negation words and flip the sign of sentiment
                        if (child.dep_ == "neg"): 
                            sentiment *= -1
                    
                    # check for nouns
                    for child in token.head.children:
                        noun = ""
                        if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                            noun = child.text
                            # Check for compound nouns
                            for subchild in child.children:
                                if subchild.dep_ == "compound":
                                    noun = subchild.text + " " + noun
                            sent_dict[noun] += sentiment
                        debug += 1
    return sent_dict

In [32]:
sentence= "I came here with my friends on a Tuesday night. The sushi here is amazing. Our waiter was very helpful, but the music was terrible."
feature_sentiment(sentence)

TypeError: Argument 'string' has incorrect type (expected str, got spacy.tokens.span.Span)