In [1]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
# from nltk.tokenizer import tokenize
from nltk.corpus import stopwords
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from collections import Counter
from imblearn.over_sampling import SMOTE
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [2]:
# read the dataset
reviews_data = pd.read_json(r"C:\Users\user\Documents\UoM\Thesis\data\Clothing_Shoes_and_Jewelry_5.json", lines=True)

# Keep only the review text and the grade
reviews_data = reviews_data[['overall', 'reviewText']]


# Drop the products whose values are null
reviews_data = reviews_data[reviews_data['reviewText'].notna()]

In [3]:
ratings = []
for index,entry in enumerate(reviews_data['overall']):
    if entry == 1.0 or entry == 2.0:
        ratings.append(-1)
    elif entry == 3.0:
        ratings.append(0)
    elif entry == 4.0 or entry == 5.0:
        ratings.append(1)

In [4]:
""""
Split each review into sentences
and preprocess each sentence
"""
stopwords = stopwords.words('english')

preprocessed_data = []
index = 0
for review in reviews_data['reviewText']:
    if(index % 10000 == 0):
        print(index)
    review_sentences = tokenizer.tokenize(review)
    for sentence in review_sentences:
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
        sentence = sentence.lower()
        sentence = word_tokenize(sentence)
        sentence = [word for word in sentence if not word in stopwords]
        sentence = ' '.join(sentence)
        if(sentence != ''):
            review = {}
            review["index"] = index
            review["sentence"] = sentence
            review["rating"] = ratings[index]
            preprocessed_data.append(review)
    index += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000


In [5]:
print("Before preprocessing: ", reviews_data['reviewText'][32 ])
print("------------------------------------------------")
print("After preprocessing: ", [d for d in preprocessed_data if d['index'] == 32 ])
print(len(preprocessed_data))

Before preprocessing:  Okay, I admit it. I'm one of the millions of Americans that massively struggles with any language either than English. I have recently returned to college to finish out my Bachelors. The university I'm enrolled in requires two years of a foreign language. Ugh. Kill me. Kill me now. BUT with using Rosetta, I was not only able to get through my class without sounding and feeling like an idiot, but I actually got A's!! I never thought that was a possibility. I'm not ready to jet off to France or anything but I'm not a complete moron any more either. I'm calling that a huge success. Once this whole bachelors things is under my belt, I will be back to pick up the next level!!
------------------------------------------------
After preprocessing:  [{'index': 32, 'sentence': 'okay admit', 'rating': 1}, {'index': 32, 'sentence': 'one millions americans massively struggles language either english', 'rating': 1}, {'index': 32, 'sentence': 'recently returned college finish b

In [6]:
# Split Dataset into training and test
indexes = [i for i in range(len(reviews_data))]

Train_X_index, Test_X_index, Train_Y_review, Test_Y_review = model_selection.train_test_split(indexes,ratings,test_size=0.3,random_state=42)

df = pd.DataFrame(preprocessed_data)

train = df[df['index'].isin(Train_X_index)]
Train_Y = train['rating'].tolist()
Train_X = train['sentence'].tolist()
Train_index = train['index'].tolist()


test = df[df['index'].isin(Test_X_index)]
Test_Y = test['rating'].tolist()
Test_X = test['sentence'].tolist()
Test_index = test['index'].tolist()

all_sentences = Train_X + Test_X

In [7]:
len(df)

1172750

In [8]:
# Encoding of label
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
Train_Y_review = Encoder.fit_transform(Train_Y_review)
Test_Y_review = Encoder.fit_transform(Test_Y_review)

In [9]:
# Word Vectorization
        
Hashing_vect = HashingVectorizer(alternate_sign=False)
Hashing_vect.fit(all_sentences)
Train_X_Hashing = Hashing_vect.transform(Train_X)
Test_X_Hashing = Hashing_vect.transform(Test_X)

In [10]:
# Oversampling 
oversample = SMOTE(random_state=100)
X_SMOTE, y_SMOTE = oversample.fit_resample(Train_X_Hashing, Train_Y)

In [11]:
def find_Max_Element(scores):
    c = Counter(scores)
    negatives = c[0]
    neutral = c[1]
    positives = c[2]
    
    if(scores[0] == -1):
        negatives += 1
    elif(scores[0] == 0):
        neutral += 1
    elif(scores[0] == 1):
        positives += 1
    
    if(scores[-1] == -1):
        negatives += 1
    elif(scores[-1] == 0):
        neutral += 1
    elif(scores[-1] == 1):
        positives += 1

    if(neutral == positives and neutral > negatives and positives > negatives):
        max_element = 1
    elif (neutral == negatives and neutral > positives and negatives > positives):
        max_element = 1
    elif (neutral == negatives and negatives == positives):
        max_element = 1
    elif (positives == negatives and positives > neutral and negatives > neutral) :
        max_element = 1
    else:
        max_element = max(set(scores), key = scores.count)
        
    return max_element
#     return  max(set(scores), key = scores.count)

In [12]:
# Support Vector Machine with linear cernel

from sklearn.svm import LinearSVC
import timeit

start = timeit.default_timer()

# Train the classifier
clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X_SMOTE,y_SMOTE)

stop = timeit.default_timer()

print("Training is done. Time: ", (stop-start)) 

predictions_SVM = clf.predict(Test_X_Hashing)

print("Predicting is done")

# Get grades by review
predicted_scores_frame = pd.DataFrame(np.column_stack([Test_index, predictions_SVM, Test_Y]), 
                               columns=['review Index', 'prediction', 'actual'])

print(len(predicted_scores_frame['review Index'].unique()) - len(Test_Y_review))

# for each review get all scores by review
scores_by_review_frame = predicted_scores_frame.groupby('review Index')['prediction'].apply(list)

majority_scores =  scores_by_review_frame.apply(find_Max_Element)
predicted_scores = list(majority_scores)

# for each review get its actual score
actual_scores_frame = predicted_scores_frame.groupby('review Index')['actual'].first()
actual_scores = list(actual_scores_frame)

# get all indexes
review_indexes = predicted_scores_frame['review Index'].unique()

# Calculate Accuracy
accuracy = accuracy_score(predicted_scores, actual_scores)*100
print("Accuracy: ", accuracy)
print("-----------------------")

# Confusion Matrix
cm = confusion_matrix(actual_scores, predicted_scores)
print("Confusion matrix: ", cm)
print("------------------------")

# Classification Report
my_tags = ['Negative','Neutral','Positive']
print(classification_report(actual_scores, predicted_scores, target_names=my_tags))
report = classification_report(actual_scores, predicted_scores, target_names=my_tags, output_dict=True);
clsf_report = pd.DataFrame(report).transpose()
clsf_report.to_csv('reports/SVM_HashingVectorizer_Oversampling.csv', index= True)

Training is done. Time:  152.9611794
Predicting is done
-9
Accuracy:  72.17297685268258
-----------------------
Confusion matrix:  [[ 3258  3079  1660]
 [ 1545  4122  3510]
 [ 2336 11132 52953]]
------------------------
              precision    recall  f1-score   support

    Negative       0.46      0.41      0.43      7997
     Neutral       0.22      0.45      0.30      9177
    Positive       0.91      0.80      0.85     66421

    accuracy                           0.72     83595
   macro avg       0.53      0.55      0.53     83595
weighted avg       0.79      0.72      0.75     83595



In [13]:
# write indexes of false classified reviews to a txt file

data = {"review_index": review_indexes, "predict": predicted_scores, "actual": actual_scores}

review_dataframe = pd.DataFrame(data)

false_classified_indexes = review_dataframe['review_index'][review_dataframe['predict'] != review_dataframe['actual']]
false_classified_indexes = list(false_classified_indexes)

with open("indexes/SVM_HashingVectorizer_Oversampling.txt", 'w') as f:
    for item in false_classified_indexes:
        f.write("%s\n" % item)
        
# save those indexes who are predicted as negative but they are neutral
negativePredicted_neutralActual_indexes = review_dataframe['review_index'][(review_dataframe['predict'] == 0) & (review_dataframe['actual'] == 1)]
negativePredicted_neutralActual_indexes = list(negativePredicted_neutralActual_indexes)
with open("indexes/SVM_HashingVectorizer_Oversampling_NegativePredicted_NeutralActual.txt", 'w') as f:
    for item in negativePredicted_neutralActual_indexes:
        f.write("%s\n" % item)

# save those indexes who are predicted as negative but they are positive
negativePredicted_positiveActual_indexes = review_dataframe['review_index'][(review_dataframe['predict'] == 0) & (review_dataframe['actual'] == 2)]
negativePredicted_positiveActual_indexes = list(negativePredicted_positiveActual_indexes)
with open("indexes/SVM_HashingVectorizer_Oversampling_NegativePredicted_PositiveActual.txt", 'w') as f:
    for item in negativePredicted_positiveActual_indexes:
        f.write("%s\n" % item)
        
# save those indexes who are predicted as neutral but they are negative
neutralPredicted_negativeActual_indexes = review_dataframe['review_index'][(review_dataframe['predict'] ==1) & (review_dataframe['actual'] == 0)]
neutralPredicted_negativeActual_indexes = list(neutralPredicted_negativeActual_indexes)
with open("indexes/SVM_HashingVectorizer_Oversampling_NeutralPredicted_NegativeActual.txt", 'w') as f:
    for item in neutralPredicted_negativeActual_indexes:
        f.write("%s\n" % item)

# save those indexes who are predicted as neutral but they are positive
neutralPredicted_positiveActual_indexes = review_dataframe['review_index'][(review_dataframe['predict'] == 1) & (review_dataframe['actual'] == 2)]
neutralPredicted_positiveActual_indexes = list(neutralPredicted_positiveActual_indexes)
with open("indexes/SVM_HashingVectorizer_Oversampling_NeutralPredicted_PositiveActual.txt", 'w') as f:
    for item in neutralPredicted_positiveActual_indexes:
        f.write("%s\n" % item)

# save those indexes who are predicted as positive but they are negative        
positivePredicted_negativeActual_indexes = review_dataframe['review_index'][(review_dataframe['predict'] ==2) & (review_dataframe['actual'] == 0)]
positivePredicted_negativeActual_indexes = list(positivePredicted_negativeActual_indexes)
with open("indexes/SVM_HashingVectorizer_Oversampling_PositivePredicted_NegativeActual.txt", 'w') as f:
    for item in positivePredicted_negativeActual_indexes:
        f.write("%s\n" % item)
 
# save those indexes who are predicted as positive but they are neutral 
positivePredicted_neutralActual_indexes = review_dataframe['review_index'][(review_dataframe['predict'] ==2) & (review_dataframe['actual'] == 1)]
positivePredicted_neutralActual_indexes = list(positivePredicted_neutralActual_indexes)
with open("indexes/SVM_HashingVectorizer_Oversampling_PositivePredicted_NeutralActual.txt", 'w') as f:
    for item in positivePredicted_neutralActual_indexes:
        f.write("%s\n" % item)

In [14]:
review_dataframe.head()

Unnamed: 0,review_index,predict,actual
0,4,2,2
1,6,2,2
2,11,2,2
3,12,2,2
4,22,2,2
