In [1]:
from tqdm import tqdm
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

import re
import string
import numpy as np
import contractions
from num2words import num2words
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from tqdm import tqdm

# VADER rule-based method

This is the same clean_text preprocessing function used in the data preprocessing notebook, kept here so that we can type our own reviews and see what we get. However, we do not stem the text or remove stop words: VADER is a rule-based method built for microblog-style text, so the input should read as normal, unprocessed English. We also do not convert the text to lower case, because VADER uses ALL-CAPS words as a sentiment modifier.

In [3]:
def clean_text(text):
    """Normalise a raw review string before it is scored.

    The steps run in this order: expand contractions, remove HTML tags,
    collapse runs of whitespace, turn curly quotes into ASCII quotes,
    strip ASCII punctuation, spell out standalone integers with
    num2words, and strip punctuation once more.

    Letter case is left untouched, and no stemming or stop-word removal
    is performed.

    Args:
        text (str): The raw review text.

    Returns:
        str: The cleaned text, with tokens joined by single spaces.
    """
    # Built once and reused by both punctuation passes.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Expand contractions
    text = contractions.fix(text)

    # Remove HTML tags
    text = re.sub('<[^<]+?>', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    # Replace curly quotes with their ASCII equivalent
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')

    # Remove punctuation
    text = text.translate(strip_punctuation)

    # Replace numbers with their written form.
    # isdecimal() is used instead of isdigit(): isdigit() is also true for
    # characters such as "²" or "①", which int() cannot parse, so a review
    # containing them would raise ValueError.
    words = [
        num2words(int(word)) if word.isdecimal() else word
        for word in text.split()
    ]
    text = ' '.join(words)

    # Second pass: remove any punctuation contained in the num2words output
    # (presumably hyphens/commas in spelled-out numbers - verify if the exact
    # spelling matters).
    text = text.translate(strip_punctuation)

    return text

In [4]:
# Load the review data from a CSV file (path is relative to this notebook;
# presumably written by the data preprocessing notebook) and preview the
# first rows.
df = pd.read_csv('../Data/preprocessed_data.csv')
df.head()

Unnamed: 0,overall,reviewText,asin,preprocessed_text
0,5,I don't spend a lot on my flags because they r...,9539723809,spend lot flag realli get beat lesser qualiti ...
1,5,A very dear friend of mine is slowly losing he...,B00000JSZH,dear friend mine slowli lose sight pen make po...
2,5,This is absolutely exquisite! It's made of car...,B00000JSZH,absolut exquisit made cardboard like descript ...
3,4,"This is really nice to use, however, just not ...",B00000JSZH,realli nice use howev color saddl shimmer give...
4,5,This Angel is beautiful. I as so glad I chose ...,B00000JSZH,angel beauti glad chose one even beauti look o...


In [5]:
# Fraction of missing values in each column (0.0 means the column is complete).
df.isna().mean()

overall              0.0
reviewText           0.0
asin                 0.0
preprocessed_text    0.0
dtype: float64

In [6]:
# Drop every row that contains a missing value, modifying df in place.
df.dropna(inplace=True)

This function converts a sentiment score in the range -1 to 1 into a rating from 1 to 5.

In [7]:
def catagorise_data(sentiment):
    """Map a sentiment score in [-1, 1] to a rating from 1 to 5.

    Buckets:
        [-1, -0.5)  -> 1
        [-0.5, 0)   -> 2
        exactly 0   -> 3
        (0, 0.5)    -> 4
        [0.5, 1]    -> 5

    Args:
        sentiment (float): Sentiment score; must lie within [-1, 1].

    Returns:
        int: The rating, one of 1, 2, 3, 4 or 5.

    Raises:
        ValueError: If the score is NaN or outside [-1, 1]. Such inputs
            previously fell through every branch and returned None.
    """
    # NaN fails every comparison, so this guard rejects it as well.
    if not -1 <= sentiment <= 1:
        raise ValueError(f"sentiment must be within [-1, 1], got {sentiment!r}")

    # The range is validated above, so each check only needs its upper bound.
    if sentiment < -0.5:
        return 1
    if sentiment < 0:
        return 2
    if sentiment == 0:
        return 3
    if sentiment < 0.5:
        return 4
    return 5

In [8]:
# Build the VADER analyser used by every scoring cell below.
# The analyser needs NLTK's 'vader_lexicon' resource; if it is missing
# (e.g. on a fresh environment) the constructor raises LookupError, so
# download the lexicon once and retry.
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

Looking at an individual example of how VADER works, we can see that the 'compound' score is positive, meaning the overall sentiment is positive, which is correct for the input sentence.

In [9]:
# Score one hand-written review: clean it, ask VADER for its polarity
# scores, then show the cleaned text followed by the score dictionary.
testText = clean_text("Good product, well put together, great presentation")
testScore = sid.polarity_scores(testText)
print(testText, testScore, sep="\n")

good product well put together great presentation
{'neg': 0.0, 'neu': 0.305, 'pos': 0.695, 'compound': 0.8442}


In [10]:
# Convert the compound score of the example above into a 1-5 rating.
print(catagorise_data(testScore['compound']))

5


We run the data through clean_text, then get a sentiment score and then convert it into a rating.

In [11]:
def generate_rating(review):
    """Predict a rating for one raw review string.

    The review is cleaned with clean_text, scored by the notebook-level
    analyser ``sid``, and the resulting 'compound' score is converted to
    a rating by catagorise_data.

    Args:
        review (str): The raw review text.

    Returns:
        The rating returned by catagorise_data for the compound score.
    """
    polarity = sid.polarity_scores(clean_text(review))
    return catagorise_data(polarity['compound'])

In [12]:
# Try a review that mixes praise with a complaint ("broke after only 1 use")
# to see which rating the pipeline assigns to it.
print(generate_rating("Good product, works well. However, broke after only 1 use. I'm happy"))

5


In [13]:
# Register tqdm's progress_apply on pandas objects so the long-running
# apply below reports its progress.
tqdm.pandas()
# Predict a rating for every raw review (the reviewText column, not
# preprocessed_text) and store it in a new vader_rating column.
df['vader_rating'] = df['reviewText'].progress_apply(generate_rating)
df.head()

100%|█████████████████████████████████████████████████████████████████████████| 179642/179642 [03:04<00:00, 975.33it/s]


Unnamed: 0,overall,reviewText,asin,preprocessed_text,vader_rating
0,5,I don't spend a lot on my flags because they r...,9539723809,spend lot flag realli get beat lesser qualiti ...,5
1,5,A very dear friend of mine is slowly losing he...,B00000JSZH,dear friend mine slowli lose sight pen make po...,5
2,5,This is absolutely exquisite! It's made of car...,B00000JSZH,absolut exquisit made cardboard like descript ...,5
3,4,"This is really nice to use, however, just not ...",B00000JSZH,realli nice use howev color saddl shimmer give...,5
4,5,This Angel is beautiful. I as so glad I chose ...,B00000JSZH,angel beauti glad chose one even beauti look o...,4


In [14]:
# Summary statistics, to compare the distribution of the true ratings
# (overall) with the predicted ones (vader_rating).
df.describe()

Unnamed: 0,overall,vader_rating
count,179642.0,179642.0
mean,4.088621,4.172721
std,1.307608,1.393277
min,1.0,1.0
25%,4.0,4.0
50%,5.0,5.0
75%,5.0,5.0
max,5.0,5.0


In [15]:
# Save the frame, including the vader_rating column, to CSV; index=False
# keeps the row index from being written as an extra column.
df.to_csv('../Data/vader.csv', index=False)