In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from collections import Counter
import nltk.data

# Load NLTK's pickled Punkt sentence tokenizer for English; it is used below
# to split each review into individual sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [2]:
# Read the dataset (JSON Lines: one review object per line).
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH at
# your local copy of the file before running.
DATA_PATH = r"C:\Users\user\Documents\UoM\Thesis\data\Clothing_Shoes_and_Jewelry_5.json"

# Keep only the grade and the review text, drop reviews whose text is null,
# and renumber the rows 0..n-1. Later cells address reviews both by a running
# counter and by index label; resetting the index keeps the two in agreement
# even when rows were dropped.
reviews_data = (
    pd.read_json(DATA_PATH, lines=True)[['overall', 'reviewText']]
    .dropna(subset=['reviewText'])
    .reset_index(drop=True)
)

In [3]:
# Map star ratings to sentiment labels:
#   1-2 stars -> -1 (negative), 3 stars -> 0 (neutral), 4-5 stars -> 1 (positive)
RATING_TO_SENTIMENT = {1.0: -1, 2.0: -1, 3.0: 0, 4.0: 1, 5.0: 1}

ratings = []
for entry in reviews_data['overall']:
    if entry not in RATING_TO_SENTIMENT:
        # Silently skipping a value would make `ratings` shorter than the
        # review list and shift every following label onto the wrong review.
        raise ValueError(f"Unexpected rating value: {entry!r}")
    ratings.append(RATING_TO_SENTIMENT[entry])

In [4]:
""""
Split each review into sentences
and preprocess each sentence
"""
# Guard the rebinding: after the first run `stopwords` is already a plain list
# of words, and calling `.words` on it again would raise AttributeError when
# the cell is re-executed.
if hasattr(stopwords, 'words'):
    stopwords = stopwords.words('english')

# One record per non-empty sentence. Each record stores the position of its
# parent review and that review's sentiment label, so sentence-level results
# can be grouped back per review later on.
preprocessed_data = []
for index, review_text in enumerate(reviews_data['reviewText']):
    # Progress indicator: print the counter every 10,000 reviews.
    if index % 10000 == 0:
        print(index)
    for sentence in tokenizer.tokenize(review_text):
        if sentence != '':
            preprocessed_data.append({
                "index": index,
                "sentence": sentence,
                "rating": ratings[index],
            })

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000


In [5]:
# Sanity check: show one raw review next to the sentence records built from it.
sample_index = 278643

# The "index" stored in each record is the review's position in iteration
# order, so the raw text is fetched positionally (.iloc) rather than by label.
print("Before preprocessing: ", reviews_data['reviewText'].iloc[sample_index])
print("------------------------------------------------")
print("After preprocessing: ", [d for d in preprocessed_data if d['index'] == sample_index])
print(len(preprocessed_data))

Before preprocessing:  idk why this shirt didnt fit i mean i ordered an Xlits very short and it seems like it was cut weird because the cross sits funny, like sideways.also one of the studs fell off the second i tried it on. ahhhhhhh whatever thats what i get for ordering clothes online loli DO NOT recommend
------------------------------------------------
After preprocessing:  [{'index': 278643, 'sentence': 'idk why this shirt didnt fit i mean i ordered an Xlits very short and it seems like it was cut weird because the cross sits funny, like sideways.also one of the studs fell off the second i tried it on.', 'rating': -1}, {'index': 278643, 'sentence': 'ahhhhhhh whatever thats what i get for ordering clothes online loli DO NOT recommend', 'rating': -1}]
1185548


In [6]:
# Build a sentence-level DataFrame from the list of per-sentence records
# (one row per sentence, with columns "index", "sentence" and "rating").
df = pd.DataFrame(preprocessed_data)

In [7]:
# Single VADER analyzer instance, created once and reused for every call.
analyzer = SentimentIntensityAnalyzer()


def calculate_vader_score(sentence):
    """Return the VADER compound polarity score for ``sentence``.

    ``analyzer.polarity_scores`` yields a dict of scores; only its
    ``'compound'`` entry is returned.
    """
    return analyzer.polarity_scores(sentence)['compound']

In [8]:
# Sanity check: score every sentence of one review, then the review as a whole.
sample_index = 234584
sample_records = [d for d in preprocessed_data if d['index'] == sample_index]
print(sample_records)
for record in sample_records:
    print(calculate_vader_score(record['sentence']))

# Records are keyed by the review's position, so fetch the full text with
# .iloc instead of a label lookup.
print(calculate_vader_score(reviews_data['reviewText'].iloc[sample_index]))

[{'index': 234584, 'sentence': "The color in the picture doesn't do the sandals justice.", 'rating': 1}, {'index': 234584, 'sentence': 'They are a beautiful beige/taupe sandal.', 'rating': 1}, {'index': 234584, 'sentence': 'The faux patent leather adds a nice shine.', 'rating': 1}, {'index': 234584, 'sentence': 'The buckle on top is a brushed silver which adds just the right amount of flare.', 'rating': 1}, {'index': 234584, 'sentence': "What's great about these sandals is they aren't completely flat (for those with arch problems).", 'rating': 1}, {'index': 234584, 'sentence': 'They have just the right heal height (about an 1 1/2 inches).', 'rating': 1}, {'index': 234584, 'sentence': 'They also have just the right amount of cushion.', 'rating': 1}, {'index': 234584, 'sentence': 'The center strap is a little tight on my foot, but will hopefully get loser with wear.', 'rating': 1}, {'index': 234584, 'sentence': 'Cute sandal!', 'rating': 1}]
0.5267
0.5994
0.4215
0.2023
0.34
0.0
0.0
-0.261

NameError: name 'prinr' is not defined

In [20]:
def get_sentiment_from_score(predicted_score):
    """Convert a numeric polarity score into a sentiment label.

    Returns 1 for a score above zero, -1 for a score below zero, and 0
    otherwise.
    """
    if predicted_score > 0:
        return 1
    return -1 if predicted_score < 0 else 0

In [None]:
# Calculate predicted scores using the VADER lexicon approach.
# Series.map calls the scorer once per sentence without building a row object
# for every record, and it also works when the frame is empty.
df['predicted_score'] = df['sentence'].map(calculate_vader_score)

# Get the sentiment label (-1, 0 or 1) from the calculated score
df['predicted_sentiment'] = df['predicted_score'].map(get_sentiment_from_score)

In [None]:
# Preview the first rows of the sentence-level frame.
df.head()

In [None]:
def find_Max_Element(scores):
    """Pick the majority sentiment label from a list of sentence labels.

    ``scores`` is a list of labels (-1 negative, 0 neutral, 1 positive).
    The most frequent label wins; ties are resolved as follows:

    * neutral tied with positive, both ahead of negative -> 1
    * neutral tied with negative, both ahead of positive -> -1
    * all three tied (including an empty list)           -> 0
    * positive tied with negative, both ahead of neutral -> 0
    """
    tally = Counter(scores)
    n_neg, n_neu, n_pos = tally[-1], tally[0], tally[1]

    # Neutral shares the lead with exactly one polarity: that polarity wins.
    if n_neu == n_pos and n_neu > n_neg:
        return 1
    if n_neu == n_neg and n_neu > n_pos:
        return -1

    # Three-way tie, or the two polarities cancel each other out.
    if (n_neu == n_neg and n_neg == n_pos) or (n_pos == n_neg and n_pos > n_neu):
        return 0

    # No tie at the top: return the most frequent label.
    return max(set(scores), key=scores.count)


In [None]:
# Report

# Fixed label order shared by the confusion matrix and the classification
# report. Passing it explicitly keeps the rows/columns aligned with `my_tags`
# even when one of the classes never occurs in the data.
sentiment_labels = [-1, 0, 1]
my_tags = ['Negative','Neutral','Positive']

# for each review, collect the sentence-level predictions and take the majority
scores_by_review_frame = df.groupby('index')['predicted_sentiment'].apply(list)
majority_scores = scores_by_review_frame.apply(find_Max_Element)
predicted_scores = list(majority_scores)

# for each review get its actual score (identical for all of its sentences)
actual_scores_frame = df.groupby('index')['rating'].first()
actual_scores = list(actual_scores_frame)

# Calculate Accuracy (arguments in scikit-learn's (y_true, y_pred) order)
accuracy = accuracy_score(actual_scores, predicted_scores)*100
print("Accuracy: ", accuracy)
print("----------------------")

# Confusion Matrix (rows: actual, columns: predicted)
cm = confusion_matrix(actual_scores, predicted_scores, labels=sentiment_labels)
print("Confusion matrix: ", cm)
print("-----------------------")

# Classification Report
report = classification_report(
    actual_scores,
    predicted_scores,
    labels=sentiment_labels,
    target_names=my_tags,
)
print(report)