In [77]:
# importing libraries
import pandas as pd
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from string import punctuation

from src.nlp_preprocessing import pos_tag
from src.nlp_preprocessing import penn_to_wn

In [78]:
# Load the preprocessed review sample (path is relative to the notebook) into `df`.
df = pd.read_csv(filepath_or_buffer='../data/preprocessed_small_sample.csv')

In [79]:
def get_sentiment_score(text):
    """
        Return the sentiment score of a given text using SentiWordNet scores.

        The text is split into sentences. Each sentence has "<br />" replaced
        by a space, punctuation removed and is lower-cased before it is
        tokenized and POS-tagged. Tokens for which penn_to_wn returns a falsy
        value, whose lemma is empty, or that have no WordNet synset are
        skipped; every other token contributes the (positive - negative)
        SentiWordNet score of its first synset. A sentence's score is the sum
        of those contributions divided by its token count, and the result is
        the mean sentence score multiplied by 100.

        input: text (str). An empty or whitespace-only string yields 0.0.
        output: numeric (float) score, >0 means positive sentiment and <0
                means negative sentiment.
    """
    # Blank input has no sentences; return a neutral score instead of
    # dividing by zero at the end.
    if isinstance(text, str) and not text.strip():
        return 0.0

    raw_sentences = sent_tokenize(text)
    if not raw_sentences:
        return 0.0

    # Build the helpers once per call rather than once per sentence / word.
    tokenizer = TreebankWordTokenizer()
    lemmatizer = WordNetLemmatizer()
    strip_punctuation = str.maketrans('', '', punctuation)

    total_score = 0
    for sentence in raw_sentences:
        sentence = str(sentence).replace("<br />", " ").translate(strip_punctuation).lower()

        # Tokenize the cleaned *sentence*. Tokenizing the full text here would
        # throw away the cleaning above and score the same tokens for every
        # sentence.
        tokens = tokenizer.tokenize(sentence)
        if not tokens:
            # e.g. a punctuation-only sentence: it contributes 0 to the total
            # but still counts in the final mean.
            continue

        sent_score = 0
        # NOTE(review): pos_tag is imported from both nltk and
        # src.nlp_preprocessing at the top of the notebook; the later import
        # is the one bound here - confirm that is the intended tagger.
        for word, tag in pos_tag(tokens):
            wn_tag = penn_to_wn(tag)
            if not wn_tag:
                continue
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
            # Only the first synset returned by WordNet is used.
            swn_synset = swn.senti_synset(synsets[0].name())
            sent_score += swn_synset.pos_score() - swn_synset.neg_score()

        total_score += sent_score / len(tokens)

    return (total_score / len(raw_sentences)) * 100

In [80]:
# testing: positive SentiWordNet score of the first adjective synset of "amazing"
swn.senti_synset(
    wn.synsets("amazing", pos=wn.ADJ)[0].name()
).pos_score()

0.5

In [45]:
# testing: print the SentiWordNet scores of every sense of 'abandon'
synsets = swn.senti_synsets('abandon')

for senti_synset in synsets:
    positive = senti_synset.pos_score()
    negative = senti_synset.neg_score()
    print("POS score:", positive)
    print("NEG score:", negative)
    # NOTE(review): the "POS OBJ:" label is printed next to obj_score().
    print("POS OBJ:", senti_synset.obj_score())
    print('Overall Score:', positive - negative)
    print()

POS score: 0.0
NEG score: 0.375
POS OBJ: 0.625
Overall Score: -0.375

POS score: 0.125
NEG score: 0.375
POS OBJ: 0.5
Overall Score: -0.25

POS score: 0.0
NEG score: 0.125
POS OBJ: 0.875
Overall Score: -0.125

POS score: 0.0
NEG score: 0.125
POS OBJ: 0.875
Overall Score: -0.125

POS score: 0.0
NEG score: 0.0
POS OBJ: 1.0
Overall Score: 0.0

POS score: 0.0
NEG score: 0.125
POS OBJ: 0.875
Overall Score: -0.125

POS score: 0.0
NEG score: 0.375
POS OBJ: 0.625
Overall Score: -0.375



In [46]:
# testing: print the SentiWordNet scores of every sense of 'love'
synsets = swn.senti_synsets('love')

for senti_synset in synsets:
    positive = senti_synset.pos_score()
    negative = senti_synset.neg_score()
    print("POS score:", positive)
    print("NEG score:", negative)
    # NOTE(review): the "POS OBJ:" label is printed next to obj_score().
    print("POS OBJ:", senti_synset.obj_score())
    print('Overall Score:', positive - negative)
    print()

POS score: 0.625
NEG score: 0.0
POS OBJ: 0.375
Overall Score: 0.625

POS score: 0.375
NEG score: 0.0
POS OBJ: 0.625
Overall Score: 0.375

POS score: 0.125
NEG score: 0.0
POS OBJ: 0.875
Overall Score: 0.125

POS score: 0.25
NEG score: 0.0
POS OBJ: 0.75
Overall Score: 0.25

POS score: 0.0
NEG score: 0.0
POS OBJ: 1.0
Overall Score: 0.0

POS score: 0.0
NEG score: 0.0
POS OBJ: 1.0
Overall Score: 0.0

POS score: 0.5
NEG score: 0.0
POS OBJ: 0.5
Overall Score: 0.5

POS score: 1.0
NEG score: 0.0
POS OBJ: 0.0
Overall Score: 1.0

POS score: 0.625
NEG score: 0.0
POS OBJ: 0.375
Overall Score: 0.625

POS score: 0.375
NEG score: 0.125
POS OBJ: 0.5
Overall Score: 0.25



In [81]:
# Drop rows whose `cleaned_review` is missing, showing the frame's shape
# before and after so the number of removed rows is visible.
display('Before dropping nulls:', df.shape)
df = df.dropna(axis='index', subset=['cleaned_review'])
display('After dropping nulls:', df.shape)

'Before dropping nulls:'

(9000, 16)

'After dropping nulls:'

(8998, 16)

In [82]:
# Score each cleaned review and store the result in a new `sentiment_score` column.
df['sentiment_score'] = df['cleaned_review'].apply(get_sentiment_score)

In [85]:
# Spot-check five random reviews next to their lemmas and computed score.
# A fixed random_state keeps the displayed rows the same across re-runs.
df[['cleaned_review', 'lemmas', 'sentiment_score']].sample(5, random_state=42)

Unnamed: 0,cleaned_review,lemmas,sentiment_score
3086,this card wouldve been cute in a silly kind of way if the soundmusic was recognizable maybe i just got a defective card but its not the song thats played in the sample in the description its not really a song or music at all just unrecognizable noise,"['card', 'wouldve', 'be', 'cute', 'silly', 'kind', 'way', 'soundmusic', 'be', 'recognizable', 'maybe', 'i', 'just', 'get', 'defective', 'card', 'not', 'song', 'thats', 'played', 'sample', 'description', 'not', 'really', 'song', 'music', 'just', 'unrecognizable', 'noise']",0.520833
6695,love it,['love'],25.0
980,i ordered this scale before and the first one didnt work at all amazon sent a replacement and this one lasted a couple months and it started giving wild weight readings now it only reads error fitbit does not respond to emails regarding this dont waste your money on this scale,"['i', 'order', 'scale', 'first', 'didnt', 'work', 'amazon', 'sent', 'replacement', 'last', 'couple', 'month', 'start', 'give', 'wild', 'weight', 'reading', 'now', 'only', 'read', 'error', 'fitbit', 'do', 'not', 'respond', 'email', 'regard', 'dont', 'waste', 'money', 'scale']",-1.715686
6317,its exactly what i have been looking for cover is made of good quality plastic love it,"['exactly', 'i', 'have', 'be', 'look', 'cover', 'be', 'make', 'good', 'quality', 'plastic', 'love']",10.294118
5,super thin bubbles pop really easily our breakage rate in shipping went way up using this bubble wrap went back to using blue hawk bubble wrap from lowes much tougher and tears far easier,"['super', 'thin', 'bubble', 'pop', 'really', 'easily', 'breakage', 'rate', 'shipping', 'go', 'way', 'up', 'use', 'bubble', 'wrap', 'go', 'back', 'use', 'blue', 'hawk', 'bubble', 'wrap', 'lowes', 'much', 'tougher', 'tear', 'far', 'easy']",0.735294
