# Aspect Based Sentiment Analysis
## Lexicon Approach

Create a domain opinion lexicon to highlight positive and negative words that may have a different connotation with respect to the domain.

Using an opinion lexicon coupled with the domain opinion words:

See aspect_sentiment method for the main process.

In [471]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import sys
sys.path.append('scripts/')
from utility import *

from collections import Counter
import spacy
nlp = spacy.load("en_core_web_sm") # English Model

RANDOM_STATE = 42

import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv('data/Skyrim_Reviews.csv', usecols=['review', 'rating'])
data.head()

Unnamed: 0,rating,review
0,Recommended,"This is the game the never, ever ends. I picke..."
1,Recommended,Ruined my life. Five stars.
2,Recommended,I was stacking books on a shelf in my house in...
3,Recommended,"Best game I ever bought. In this game, you'll ..."
4,Recommended,Playing Skyrim is like masturbating. Feels goo...


In [5]:
def custom_sentencizer(doc):
    
    boundary = re.compile('\.{2,}')
    digits = re.compile('^[0-9]*$')
    
    for token in doc[:-1]:
        if boundary.match(token.text) or digits.match(token.text) or token.text in [',', '!', '?', ':']:
            doc[token.i+1].is_sent_start = False
                
    return doc

In [6]:
nlp.add_pipe(custom_sentencizer, before="parser")

## Data Preprocess

In [7]:
norm_data = normalise_text(data['review'], All=False, expand_contraction= True,\
                           remove_special_character=True, remove_digit=False, remove_white_spaces=True)

## Create Domain Opinion Lexicon

### Information Gain
Measures the reduction in uncertainty for one variable given a known value of the other variable.

I(X ; Y) = H(X) – H(X | Y)

Where I(X ; Y) is the mutual information for X and Y, H(X) is the entropy for X and H(X | Y) is the conditional entropy for X given Y.


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=tokenize, stop_words=spacy_stopwords, lowercase=True)

word_vec = vectorizer.fit_transform(norm_data)

In [9]:
vocab = vectorizer.get_feature_names()
word_data = pd.DataFrame(word_vec.toarray(), columns = vocab)

In [10]:
from sklearn.feature_selection import mutual_info_classif

word_IG = dict(zip(vocab,
               mutual_info_classif(word_data, data['rating'],
                                   discrete_features=True,
                                   random_state=RANDOM_STATE
                                  )
               ))

In [86]:
IG_data = pd.DataFrame(sorted(word_IG.items(), key=lambda x: x[1], reverse=True), columns=['Word','Information Gain'])
IG_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16292,16293,16294,16295,16296,16297,16298,16299,16300,16301
Word,mods,paid,valve,pay,best,world,quests,10/10,dragons,rpg,...,winds,winner,wiping,wishing,wit,witnessing,worship,wrapped,wrath,wth
Information Gain,0.119875,0.119517,0.0938011,0.0461954,0.0371711,0.0328329,0.0323099,0.0305012,0.0282482,0.0281762,...,5.10497e-08,5.10497e-08,5.10497e-08,5.10497e-08,5.10497e-08,5.10497e-08,5.10497e-08,5.10497e-08,5.10497e-08,5.10497e-08


#### Get Adjectives/Num words

In [87]:
domain_opinion_words = [word for word in IG_data['Word'] if nlp(word)[0].pos_ in ['ADJ','NUM']]

#### Add positive and negative domain opinion words based on domain knowledge

In [497]:
pos_domain_opinion_words = ['unfathomable','outstandddding''vivid','ominous','sneaky','11/10','9.5/10','9/10','10/10','long']
neg_domain_opinion_words = ['2/10','5/10','short']

### Opinion Words Lexicon

https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

In [495]:
# Opinion words
neg_file = open("data/negative-words.txt")
pos_file = open("data/positive-words.txt")
neg = [line.strip() for line in neg_file.readlines()]
pos = [line.strip() for line in pos_file.readlines()]

opinion_words = (pos + pos_domain_opinion_words) + (neg + neg_domain_opinion_words)

#### Decreasing words

In [326]:
# Quantity words
decreasing_words = ['less', 'disappeared', 'removal', 'disappeared','reduced','short','scaled']

In [473]:
stop_words = {'thing','things','something','it','than'}

def remove_stop_word_aspects(d: dict, stopwords=stop_words):
    """
    Remove stop word keys from dictionary
    """
    for word in stopwords:
        try:
            del d[word]
        except:
            pass

### Aspect Sentiment Method

In [529]:
def aspect_sentiment(text: str):
    """
    returns Counter with the aspect and sentiment value based on the rules specified
    """
    sent_dict = Counter()
    sentiment = 0
    for sentence in sentence_tokenize(text):
        # Sentences with '?' are handled as questions and thus not opinionated
        if '?' in str(sentence):
            continue
        sentence_tokens = nlp(str(sentence))
        for token in sentence_tokens:
            # Left and rights word(s) used to target sentiment shifters such as 'no'/decreasing words
            if token.i != len(sentence_tokens) - 1:
                right_word = str(sentence_tokens[token.i+1])
            else:
                right_word = ''
            if token.i != 0:
                left_word = str(sentence_tokens[token.i-1])
            else:
                left_word = ''
            # check if the word is an opinion word, then assign sentiment
            if token.text in opinion_words:
                if token.text in pos:
                    if right_word in decreasing_words or left_word in decreasing_words:
                        sentiment = -1
                    else:
                        sentiment = 1
                if token.text in neg:
                    if right_word in decreasing_words or left_word in decreasing_words:
                        sentiment = 1
                    else:
                        sentiment = -1
                #if target is an adverb modifier (i.e. pretty, highly, etc.)
                # but happens to be an opinion word, ignore and pass
                if token.dep_ == "advmod":
                    continue
                elif token.dep_ == "amod":
                    sent_dict[token.head.text] += sentiment
                elif token.dep_ == "nsubj":
                    if token.head.text in decreasing_words:
                        sentiment *= -1
                        sent_dict[token.text] += sentiment
                # for opinion words that are adjectives, adverbs, verbs...
                else:
                    for child in token.children:
                        # if there's a adj modifier (i.e. very, pretty, etc.) add more weight to sentiment
                        # This could be better updated for modifiers that either positively or negatively emphasize
                        if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                            sentiment *= 1.5
                        # check for negation words and flip the sign of sentiment
                        if child.dep_ == "neg" or child.text == 'no' or child.text in decreasing_words:
                            sentiment *= -1
                        # if verb, check if there's a direct object and it's not a pronoun
                        if (token.pos_ == "VERB") and (child.dep_ == "dobj" and child.pos_ != 'PRON'):
                            sent_dict[child.text] += sentiment
                            # check for conjugates (a AND b), then add both to dictionary
                            subchildren = []
                            conj = 0
                            for subchild in child.children:
                                if subchild.text == "and":
                                    conj=1
                                if (conj == 1) and (subchild.text != "and"):
                                    subchildren.append(subchild.text)
                                    conj = 0
                            for subchild in subchildren:
                                sent_dict[subchild] += sentiment
    
                    # check for negation
                    for child in token.head.children:
                        noun = ""
                        if ((child.dep_ == "amod") or (child.dep_ == "advmod")) and (child.text in opinion_words):
                            sentiment *= 1.5
                        # check for negation words and flip the sign of sentiment
                        if (child.dep_ == "neg"):
                            sentiment *= -1
                        # check for nouns
                        conj = 0
                        if (child.pos_ == "NOUN") and (child.text not in sent_dict):
                            noun = child.text
                            # Check for compound nouns
                            for subchild in child.children:
                                if subchild.dep_ == "compound":
                                    noun = subchild.text + " " + noun
                                if subchild.text == "and":
                                    conj=1
                                if (conj == 1) and (subchild.text != "and"):
                                    sent_dict[subchild] += sentiment
                                    conj = 0
                            sent_dict[noun] += sentiment
    remove_stop_word_aspects(sent_dict)
    return sent_dict

In [530]:
from time import perf_counter
start = perf_counter()
counter = Counter()
for review in data['review']:
    counter.update(aspect_sentiment(expand_contractions(review)))
end = perf_counter()
execution_time = (end - start)
print(f'{len(counter)} Aspects - Runtime: {execution_time:.2f}s / {execution_time/60:.2f}mins')

4150 Aspects - Runtime: 479.62s / 7.99mins


In [531]:
def filter_keywords(counter, n: int):
    """
    Get aspects with atleast more than n sentiment values 
    """
    for key in counter.copy():
        if abs(counter[key]) <= n:
            del counter[key]
    print(f'Aspects: {len(counter)}')
    return counter

In [532]:
aspect_dict = filter_keywords(counter, 10)

Aspects: 200


In [533]:
pos_aspect = [(aspect,value) for (aspect,value) in aspect_dict.items() if value > 0]
neg_aspect = [(aspect,value) for (aspect,value) in aspect_dict.items() if value < 0]

## Positive Aspects

In [534]:
pd.DataFrame(pos_aspect, columns=['Aspect', 'Sentiment_value']).sort_values(by='Sentiment_value', ascending=False).T

Unnamed: 0,31,1,69,32,58,54,50,52,7,73,48,42,24,51,85,9,72,46,25,92,43,61,57,56,40,35,64,71,62,13,80,79,84,88,4,38,103,59,55,22,96,37,110,26,19,105,66,94,83,39,106,3,65,70,21,60,23,93,29,77,44,86,17,6,33,89,81,112,95,8,18,108,98,97,100,101,45,12,47,91,104,90,109,107,0,82,27,10,11,20,30,75,114,14,113,2,5,99,78,28,102,16,15,67,111,87,76,68,34,36,41,49,63,74,53
Aspect,game,games,mods,fun,graphics,experience,work,RPG,magic,Skyrim,skill,love,story,for,community,freedom,RPGs,world,support,modders,gameplay,masterpiece,content,top,gold,quests,part,variety,Mods,one,storyline,fans,way,hero,combat,armor,music,creators,amount,advantage,job,juice,adventure,time,reward,lore,enjoyment,soundtrack,passion,weapons,Oblivion,bonus,series,improvement,sword,button,words,favor,setting,place,skyrim,master,excitement,user,toon,travel,version,Graphics,good,sense,run,line,landscapes,modding community,improvements,alot,rolls,rewards,mechanics,faith,gamer,modding,beauty,conscience,purchase,atmosphere,time fans,soundtracks,parameters,abilities,race,feature,value,blasts,liking,Acrobatics skill,Elf regeneration,right,scenery,denser,stories,attack patterns,circles,DLCs,luck,perfection,progress,success,exploration works,Lizards,games fun,PC,joy,character,quality
Sentiment_value,1348,346,236.5,228,113.5,102.5,100.5,89,86,84,80,76.5,75,72,71.5,70.5,66,63.5,60,58,55,48,48,47.5,46.5,44.5,44.5,42.5,38,38,36,33,30.75,29.5,29.25,29,27,24.5,24,23.5,23,22,22,21,21,20,20,20,20,20,19,19,18,18,18,17.5,17,17,17,17,17,17,17,17,16.5,16,15.5,15,15,15,15,15,15,14.5,14.5,14.5,14,14,14,13,13,13,13,13,13,13,13,13,13,13,13,13,12,12,12,12,12,12,12,12,12,12,12,11.5,11,11,11,11,11,11,11,11,11,11,10.75


# Negative Aspects

In [535]:
pd.DataFrame(neg_aspect, columns=['Aspect', 'Sentiment_value']).sort_values(by='Sentiment_value').T

Unnamed: 0,49,30,16,65,14,46,50,56,51,5,27,42,37,70,25,52,79,62,45,55,40,72,34,53,60,38,36,47,63,35,64,59,84,78,33,54,13,48,39,82,3,67,7,32,44,17,73,58,80,6,68,66,4,83,9,8,75,69,61,0,41,19,1,77,76,18,74,10,26,2,15,24,20,12,23,11,22,57,71,28,43,29,81,31,21
Aspect,bugs,death,dungeons,greed,enemies,issues,problems,buggy,crap,dragon,hell,glitches,review,lack,dragons,issue,publicity,idea,dungeon,bug,chicken,shame,difficulty,damage,practice,loot,cave,enemy,garbage,people,Valve,mess,behavior,practices,life,mistake,scream,decision,assassin,destruction,rage,%,attack,tank,flaws,assault,plot,joke,scam,ruins,business,crashes,choice,prisoner,frost,foe,loss,precedent,move,touch,reviews,state,trolls,evil,UI,thought,fear,creature,playtroughs,character creation,ice,demon,College librarian,flap,realm,arrows,cross,mystery,others,wife,conflict,sorrow,pay,boss,Characters
Sentiment_value,-181.75,-91.5,-76,-71.5,-60.5,-58,-58,-51.5,-49,-43,-42,-40,-39,-39,-38,-36,-36,-34,-33.5,-33,-29,-28,-27,-26.5,-25,-25,-25,-24,-24,-24,-22.25,-22,-22,-20,-19,-19,-19,-19,-18,-17,-17,-17,-17,-17,-17,-17,-16.5,-16.5,-16,-16,-15.5,-15,-15,-15,-15,-14,-13.5,-13,-13,-13,-13,-12,-12,-12,-12,-12,-12,-12,-12,-12,-12,-12,-12,-12,-12,-12,-12,-11.75,-11.5,-11,-11,-11,-11,-11,-11
