In [251]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.tokenizer import tokenize
from nltk.corpus import stopwords
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from collections import Counter
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [257]:
# read the dataset
reviews_data = pd.read_json(r"C:\Users\user\Documents\UoM\Thesis\data\All_Beauty_5.json", lines=True)

# Keep only the review text and the grade
reviews_data = reviews_data[['overall', 'reviewText']]

# Drop the products whose values are null
reviews_data = reviews_data[reviews_data['reviewText'].notna()]

In [258]:
ratings = []
for index,entry in enumerate(reviews_data['overall']):
    if entry == 1.0 or entry == 2.0:
        ratings.append(-1)
    elif entry == 3.0:
        ratings.append(0)
    elif entry == 4.0 or entry == 5.0:
        ratings.append(1)

In [301]:
""""
Split each review into sentences
and preprocess each sentence
"""
preprocessed_data = []
index = 0
for review in reviews_data['reviewText']:
    # Split each review into sentences
    review_sentences = tokenizer.tokenize(review)
    for sentence in review_sentences:
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
        sentence = sentence.lower()
        sentence = word_tokenize(sentence)
        sentence = [word for word in sentence if not word in set(stopwords.words('english'))]
        sentence = ' '.join(sentence)
        review = {}
        review["index"] = index
        review["sentence"] = sentence
        review["rating"] = ratings[index]
        preprocessed_data.append(review)
    index += 1

In [302]:
print("Before preprocessing: ", reviews_data['reviewText'][34])
print("------------------------------------------------")
print("After preprocessing: ", [d for d in preprocessed_data if d['index'] == 34])

Before preprocessing:  My product was not sealed and either used or something.. attached are pictures. Would like a refund please.
------------------------------------------------
After preprocessing:  [{'index': 34, 'sentence': 'product sealed either used something attached pictures', 'rating': -1}, {'index': 34, 'sentence': 'would like refund please', 'rating': -1}]


In [303]:
# Split Dataset into training and test
indexes = [i for i in range(len(reviews_data))]

Train_X_index, Test_X_index, Train_Y_review, Test_Y_review = model_selection.train_test_split(indexes,ratings,test_size=0.3)

Train_X = [d['sentence'] for d in preprocessed_data if d['index'] in Train_X_index]
Train_Y = [d['rating'] for d in preprocessed_data if d['index'] in Train_X_index]
Test_X = [d['sentence'] for d in preprocessed_data if d['index'] in Test_X_index]
Test_Y = [d['rating'] for d in preprocessed_data if d['index'] in Test_X_index]
Train_index = [d['index'] for d in preprocessed_data if d['index'] in Train_X_index]
Test_index = [d['index'] for d in preprocessed_data if d['index'] in Test_X_index]

all_sentences = Train_X + Test_X

In [304]:
len(reviews_data.notna())

5264

In [305]:
# Encoding of label
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
Train_Y_review = Encoder.fit_transform(Train_Y_review)
Test_Y_review = Encoder.fit_transform(Test_Y_review)

In [306]:
# Word Vectorization
        
Hashing_vect = HashingVectorizer(alternate_sign=False)
Hashing_vect.fit(all_sentences)
Train_X_Hashing = Hashing_vect.transform(Train_X)
Test_X_Hashing = Hashing_vect.transform(Test_X)

In [309]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Hashing,Train_Y)

# predict the labels on validation dataset
predictions_NB_sentences = Naive.predict(Test_X_Hashing)

# Get grades by review
predicted_scores_frame = pd.DataFrame(np.column_stack([Test_index, predictions_NB_sentences]), 
                               columns=['review Index', 'prediction'])

#find the majority score by review
reviews_sentiment = []
for item in predicted_scores_frame['review Index'].unique():
    scores = list(predicted_scores_frame[predicted_scores_frame['review Index'] == item]['prediction'])
    majority_score = max(set(scores), key = scores.count)
    reviews_sentiment.append(majority_score)

# Calculate Accuracy
accuracy = accuracy_score(reviews_sentiment, Test_Y_review)*100

# Confusion Matrix
cm = confusion_matrix(Test_Y_review, reviews_sentiment)

# Classification Report
my_tags = ['Negative','Neutral','Positive']
report = classification_report(Test_Y_review, reviews_sentiment, target_names=my_tags)


  'precision', 'predicted', average, warn_for)


In [300]:
print(accuracy)

93.86075949367088
