In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.tokenizer import tokenize
from nltk.corpus import stopwords
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from collections import Counter
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [2]:
# read the dataset
reviews_data = pd.read_json(r"C:\Users\user\Documents\UoM\Thesis\data\Clothing_Shoes_and_Jewelry_5.json", lines=True)

# Keep only the review text and the grade
reviews_data = reviews_data[['overall', 'reviewText']]


# Drop the products whose values are null
reviews_data = reviews_data[reviews_data['reviewText'].notna()]

In [3]:
ratings = []
for index,entry in enumerate(reviews_data['overall']):
    if entry == 1.0 or entry == 2.0:
        ratings.append(-1)
    elif entry == 3.0:
        ratings.append(0)
    elif entry == 4.0 or entry == 5.0:
        ratings.append(1)

In [4]:
""""
Split each review into sentences
and preprocess each sentence
"""
stopwords = stopwords.words('english')

preprocessed_data = []
index = 0
for review in reviews_data['reviewText']:
    if(index % 10000 == 0):
        print(index)
    review_sentences = tokenizer.tokenize(review)
    for sentence in review_sentences:
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
        sentence = sentence.lower()
        sentence = word_tokenize(sentence)
        sentence = [word for word in sentence if not word in stopwords]
        sentence = ' '.join(sentence)
        if(sentence != ''):
            review = {}
            review["index"] = index
            review["sentence"] = sentence
            review["rating"] = ratings[index]
            preprocessed_data.append(review)
    index += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000


In [5]:
print("Before preprocessing: ", reviews_data['reviewText'][75945 ])
print("------------------------------------------------")
print("After preprocessing: ", [d for d in preprocessed_data if d['index'] == 75945 ])
print(len(preprocessed_data))

Before preprocessing:  These socks fall down past the heel.  Normally this brand is good, but these did not work out at all.
------------------------------------------------
After preprocessing:  [{'index': 75945, 'sentence': 'socks fall past heel', 'rating': -1}, {'index': 75945, 'sentence': 'normally brand good work', 'rating': -1}]
1172750


In [6]:
# Split Dataset into training and test
indexes = [i for i in range(len(reviews_data))]

Train_X_index, Test_X_index, Train_Y_review, Test_Y_review = model_selection.train_test_split(indexes,ratings,test_size=0.3)

df = pd.DataFrame(preprocessed_data)

train = df[df['index'].isin(Train_X_index)]
Train_Y = train['rating'].tolist()
Train_X = train['sentence'].tolist()
Train_index = train['index'].tolist()


test = df[df['index'].isin(Test_X_index)]
Test_Y = test['rating'].tolist()
Test_X = test['sentence'].tolist()
Test_index = test['index'].tolist()

all_sentences = Train_X + Test_X

In [7]:
len(df)

1172750

In [8]:
# Encoding of label
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
Train_Y_review = Encoder.fit_transform(Train_Y_review)
Test_Y_review = Encoder.fit_transform(Test_Y_review)

In [9]:
# Word Vectorization
        
Hashing_vect = HashingVectorizer(alternate_sign=False)
Hashing_vect.fit(all_sentences)
Train_X_Hashing = Hashing_vect.transform(Train_X)
Test_X_Hashing = Hashing_vect.transform(Test_X)

In [10]:
def find_Max_Element(scores):
    c = Counter(scores)
    negatives = c[0]
    neutral = c[1]
    positives = c[2]

    if(neutral == positives and neutral > negatives and positives > negatives):
        max_element = 1
    elif (neutral == negatives and neutral > positives and negatives > positives):
        max_element = 1
    elif (neutral == negatives and negatives == positives):
        max_element = 1
    elif (positives == negatives and positives > neutral and negatives > neutral) :
        max_element = 1
    else:
        max_element = max(set(scores), key = scores.count)
        
    return max_element
#     return  max(set(scores), key = scores.count)

In [11]:
# Support Vector Machine with linear cernel

from sklearn.svm import LinearSVC
import timeit

start = timeit.default_timer()

# Train the classifier
clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(Train_X_Hashing,Train_Y)

stop = timeit.default_timer()

print("Training is done. Time: ", (stop-start)) 

predictions_SVM = clf.predict(Test_X_Hashing)

print("Predicting is done")

# Get grades by review
predicted_scores_frame = pd.DataFrame(np.column_stack([Test_index, predictions_SVM, Test_Y]), 
                               columns=['review Index', 'prediction', 'actual'])

print(len(predicted_scores_frame['review Index'].unique()) - len(Test_Y_review))

# for each review get all scores by review
scores_by_review_frame = predicted_scores_frame.groupby('review Index')['prediction'].apply(list)

majority_scores =  scores_by_review_frame.apply(find_Max_Element)
predicted_scores = list(majority_scores)

# for each review get its actual score
actual_scores_frame = predicted_scores_frame.groupby('review Index')['actual'].first()
actual_scores = list(actual_scores_frame)

# Calculate Accuracy
accuracy = accuracy_score(predicted_scores, actual_scores)*100
print("Accuracy: ", accuracy)
print("-----------------------")

# Confusion Matrix
cm = confusion_matrix(actual_scores, predicted_scores)
print("Confusion matrix: ", cm)
print("------------------------")

# Classification Report
my_tags = ['Negative','Neutral','Positive']
report = classification_report(actual_scores, predicted_scores, target_names=my_tags);
print(report)

Training is done. Time:  54.18699330000004
Predicting is done
-10
Accuracy:  80.46869392540134
-----------------------
Confusion matrix:  [[  536   685  6738]
 [   84   382  8579]
 [   28   213 66349]]
------------------------
              precision    recall  f1-score   support

    Negative       0.83      0.07      0.12      7959
     Neutral       0.30      0.04      0.07      9045
    Positive       0.81      1.00      0.90     66590

    accuracy                           0.80     83594
   macro avg       0.65      0.37      0.36     83594
weighted avg       0.76      0.80      0.73     83594

