In [1]:
!pip install wordcloud



In [1]:
import os
import json
import gzip
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
import re
import string

from tqdm import tqdm
from matplotlib import pyplot as plt
from urllib.request import urlopen
from numpy.linalg import norm
from collections import defaultdict
from math import sqrt


from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn import neighbors

from scipy.spatial.distance import cosine

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_curve

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from itertools import chain
from wordcloud import WordCloud, STOPWORDS
from fractions import Fraction

# default plot configurations 
%matplotlib inline 
plt.rcParams['figure.figsize'] = (16,8)
plt.rcParams['figure.dpi'] = 150
sns.set()

### Loading Data

In [2]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
        if i % 500000 == 0: print(i)
    return pd.DataFrame.from_dict(df, orient='index')

In [3]:
print("Start loading review data")
review_data = getDF(r'C:\Users\Xylon\Desktop\data200_grad\data\Toys_and_Games.json.gz')
print("Finish loading review data")

# total length of list, this number equals total number of products
print(len(review_data))

Start loading review data
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000
Finish loading review data
8201231


In [4]:
review_data.head(5)

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,image,style
0,2.0,12.0,False,"09 22, 2016",A1IDMI31WEANAF,20232233,Mackenzie Kent,"When it comes to a DM's screen, the space on t...",The fact that 50% of this space is wasted on a...,1474502400,,
1,1.0,21.0,False,"09 18, 2016",A4BCEVVZ4Y3V3,20232233,Jonathan Christian,An Open Letter to GaleForce9*:\n\nYour unpaint...,Another worthless Dungeon Master's screen from...,1474156800,,
2,3.0,19.0,True,"09 12, 2016",A2EZ9PY1IHHBX0,20232233,unpreparedtodie,"Nice art, nice printing. Why two panels are f...","pretty, but also pretty useless",1473638400,,
3,5.0,,True,"03 2, 2017",A139PXTTC2LGHZ,20232233,Ashley,Amazing buy! Bought it as a gift for our new d...,Five Stars,1488412800,,
4,1.0,3.0,True,"02 8, 2017",A3IB33V29XIL8O,20232233,Oghma_EM,As my review of GF9's previous screens these w...,Money trap,1486512000,,


### Data Cleaning

Technically, a user can only make 1 review at a reviewTime for a product. Therefore, we remove the duplicates which share the same reviewerID, product asin, and unixReviewTime.

In [5]:
review_data.drop_duplicates(subset=['reviewerID', 'asin', 'unixReviewTime'], inplace=True)

Extract useful columns.

In [6]:
review_data = review_data[['overall', 'reviewerID', 'asin', 'summary']]

Fill in 0 and " " for NaN values in 'vote', 'reviewText', and 'summary'. Drop the remaining NaN values.

In [7]:
review_data['summary'] = review_data['summary'].fillna('')

In [8]:
review_data = review_data.dropna()

In [9]:
print(len(review_data))

8002579


In [10]:
review_data.head(5)

Unnamed: 0,overall,reviewerID,asin,summary
0,2.0,A1IDMI31WEANAF,20232233,The fact that 50% of this space is wasted on a...
1,1.0,A4BCEVVZ4Y3V3,20232233,Another worthless Dungeon Master's screen from...
2,3.0,A2EZ9PY1IHHBX0,20232233,"pretty, but also pretty useless"
3,5.0,A139PXTTC2LGHZ,20232233,Five Stars
4,1.0,A3IB33V29XIL8O,20232233,Money trap


### Data Preprocessing

Select reviews of products that have more than 50 reviewers.

In [23]:
count = review_data.groupby("asin", as_index=False).count()

df_merge = pd.merge(review_data, count, how='right', on=['asin'])

df_merge = df_merge.rename(columns={"overall_x": "overall", "summary_x": "summary", "reviewerID_y": "numReviewer"})

df_merge = df_merge.sort_values(by='numReviewer', ascending=False)
df_50 = df_merge[df_merge['numReviewer'] >= 50]

df_50 = df_50[['overall', 'asin', 'summary', 'numReviewer']]

In [24]:
df_50.head(5)

Unnamed: 0,overall,asin,summary,numReviewer
2343547,5.0,B004S8F7QM,Five Stars,8815
2345034,5.0,B004S8F7QM,Must have for every party or adult game night.,8815
2345032,1.0,B004S8F7QM,Trash!,8815
2345031,5.0,B004S8F7QM,Five Stars,8815
2345030,5.0,B004S8F7QM,Expect to laugh a lot,8815


In [25]:
len(df_50)

4843224

Grouping all the summary reviews by product ID into lists

In [28]:
summary_product = df_50.groupby("asin")["summary"].apply(list)
df_summary_product = pd.DataFrame(summary_product)

In [29]:
df_summary_product.head(5)

Unnamed: 0_level_0,summary
asin,Unnamed: 1_level_1
486448789,"[Great idea..., good, but not a good price, Sh..."
545561647,"[Three Stars, they love them, but it's a ton o..."
615638996,"[Four Stars, Great product, Five Stars, Worth ..."
692770445,"[The egg is a great teaching idea, The 7-year-..."
735333467,"[Super cute but not durable, So cute!, At firs..."


Append the average overall rating for each product

In [30]:
df_mean = review_data.groupby("asin", as_index=False).mean()

In [50]:
df = pd.merge(df_summary_product, df_mean, on="asin", how='inner')
df = df[['asin','summary','overall']]

In [51]:
df['summary'] = df['summary'].astype(str)

In [52]:
df.head(5)

Unnamed: 0,asin,summary,overall
0,486448789,"['Great idea...', 'good, but not a good price'...",3.85567
1,545561647,"['Three Stars', ""they love them, but it's a to...",3.950739
2,615638996,"['Four Stars', 'Great product', 'Five Stars', ...",4.651515
3,692770445,"['The egg is a great teaching idea', 'The 7-ye...",4.294872
4,735333467,"['Super cute but not durable', 'So cute!', 'At...",4.508475


Preprocessing the summary

In [53]:
# tokenizer
regEx = re.compile('[^a-z]+')
def cleanReviews(reviewText):
    reviewText = reviewText.lower()
    reviewText = regEx.sub(' ', reviewText).strip()
    return reviewText

In [54]:
#reset index and drop duplicate rows
df["summaryClean"] = df["summary"].apply(cleanReviews)

In [55]:
df.head(5)

Unnamed: 0,asin,summary,overall,summaryClean
0,486448789,"['Great idea...', 'good, but not a good price'...",3.85567,great idea good but not a good price should be...
1,545561647,"['Three Stars', ""they love them, but it's a to...",3.950739,three stars they love them but it s a ton of d...
2,615638996,"['Four Stars', 'Great product', 'Five Stars', ...",4.651515,four stars great product five stars worth the ...
3,692770445,"['The egg is a great teaching idea', 'The 7-ye...",4.294872,the egg is a great teaching idea the year old ...
4,735333467,"['Super cute but not durable', 'So cute!', 'At...",4.508475,super cute but not durable so cute at first i ...


### Feature Extraction

In [63]:
reviews = df["summaryClean"] 

In [64]:
# might be able to use TfIdf tokenizer
countVector = CountVectorizer(max_features = 300, stop_words='english') 
transformed_reviews = countVector.fit_transform(reviews) 

In [65]:
df_feature = pd.DataFrame(transformed_reviews.A, columns=countVector.get_feature_names())
df_feature = df_feature.astype(int)

In [66]:
df_feature.head(5)

Unnamed: 0,absolutely,actually,addition,adorable,adults,advertised,age,ages,amazing,amazon,...,work,worked,working,works,worth,wrong,year,years,young,yr
0,0,0,1,0,0,1,1,0,0,0,...,0,1,0,0,0,0,4,0,0,1
1,0,0,0,9,1,0,2,1,0,0,...,1,0,0,1,3,0,7,0,0,1
2,0,0,1,0,0,0,1,1,0,0,...,3,0,1,2,3,0,2,0,0,1
3,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,2,1,0,0
4,2,0,0,2,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0


### Train-test split

In [73]:
X = np.array(df_feature)

train_size = 0.8
tsize = int(np.floor(train_size * len(df_feature)))
X_train = X[:tsize]
X_test = X[tsize:]

print("Length of training set:", len(X_train))
print("Length of test set:", len(X_test))

Length of training set: 23846
Length of test set: 5962


### Recommendation System (KNN)

In [74]:
knn = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(X_train)

In [75]:
# find most related products for the first 20 products
for i in range(20):
    a = knn.kneighbors([X_test[i]])
    related_product_list = a[1]

    first_related_product = [item[0] for item in related_product_list]
    first_related_product = str(first_related_product).strip('[]')
    first_related_product = int(first_related_product)
    second_related_product = [item[1] for item in related_product_list]
    second_related_product = str(second_related_product).strip('[]')
    second_related_product = int(second_related_product)
    
    print ("Based on product reviews, for ", df["asin"][len(X_train) + i] ," average rating is ",df["overall"][len(X_train) + i])
    print ("The first similar product is ", df["asin"][first_related_product] ," average rating is ",df["overall"][first_related_product])
    print ("The second similar product is ", df["asin"][second_related_product] ," average rating is ",df["overall"][second_related_product])
    print ("-----------------------------------------------------------")

Based on product reviews, for  B00WXYNLYS  average rating is  4.796296296296297
The first similar product is  B00EZIKSZK  average rating is  4.96969696969697
The second similar product is  B00F14IHO6  average rating is  4.515625
-----------------------------------------------------------
Based on product reviews, for  B00WXYNS2S  average rating is  4.71830985915493
The first similar product is  B00R8ZVPVS  average rating is  4.783333333333333
The second similar product is  B00DR7T8W4  average rating is  4.672727272727273
-----------------------------------------------------------
Based on product reviews, for  B00WXYNSJQ  average rating is  4.633333333333334
The first similar product is  B00I3MOU58  average rating is  4.69811320754717
The second similar product is  B00AZP3ZGG  average rating is  4.773584905660377
-----------------------------------------------------------
Based on product reviews, for  B00WZU720Y  average rating is  4.26530612244898
The first similar product is  B0078Z

### Predict overall rating using KNN

#### n_neighbors=3, algorithm='ball_tree'

In [85]:
y_train = df["overall"][:len(X_train)]
y_test = df["overall"][len(X_train):]
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [87]:
n_neighbors = 3
knnclf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance', algorithm='ball_tree')
knnclf.fit(X_train, y_train)
y_test_pred = knnclf.predict(X_test)

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.50      0.56      0.53        18
           2       0.78      0.43      0.55       226
           3       0.65      0.38      0.48      1340
           4       0.84      0.96      0.89      4376
           5       0.00      0.00      0.00         2

    accuracy                           0.81      5962
   macro avg       0.55      0.46      0.49      5962
weighted avg       0.79      0.81      0.79      5962



#### n_neighbors=3, algorithm='brute'

In [88]:
n_neighbors = 3
knnclf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance', algorithm='brute')
knnclf.fit(X_train, y_train)
y_test_pred = knnclf.predict(X_test)

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.50      0.56      0.53        18
           2       0.79      0.42      0.55       226
           3       0.65      0.37      0.47      1340
           4       0.83      0.96      0.89      4376
           5       0.00      0.00      0.00         2

    accuracy                           0.81      5962
   macro avg       0.55      0.46      0.49      5962
weighted avg       0.79      0.81      0.78      5962



#### n_neighbors=3, algorithm='KD_Tree'

In [89]:
n_neighbors = 3
knnclf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance', algorithm='kd_tree')
knnclf.fit(X_train, y_train)
y_test_pred = knnclf.predict(X_test)

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.50      0.56      0.53        18
           2       0.78      0.43      0.56       226
           3       0.65      0.37      0.47      1340
           4       0.83      0.96      0.89      4376
           5       0.00      0.00      0.00         2

    accuracy                           0.81      5962
   macro avg       0.55      0.46      0.49      5962
weighted avg       0.79      0.81      0.79      5962



#### n_neighbors=5, algorithm='ball_tree'

In [90]:
n_neighbors = 5
knnclf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance', algorithm='ball_tree')
knnclf.fit(X_train, y_train)
y_test_pred = knnclf.predict(X_test)

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.53      0.44      0.48        18
           2       0.81      0.42      0.55       226
           3       0.67      0.34      0.45      1340
           4       0.83      0.97      0.90      4376
           5       0.00      0.00      0.00         2

    accuracy                           0.81      5962
   macro avg       0.57      0.44      0.48      5962
weighted avg       0.79      0.81      0.78      5962



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### n_neighbors=5, algorithm='brute'

In [91]:
n_neighbors = 5
knnclf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance', algorithm='brute')
knnclf.fit(X_train, y_train)
y_test_pred = knnclf.predict(X_test)

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.53      0.44      0.48        18
           2       0.80      0.42      0.55       226
           3       0.67      0.35      0.46      1340
           4       0.83      0.97      0.90      4376
           5       0.00      0.00      0.00         2

    accuracy                           0.81      5962
   macro avg       0.57      0.44      0.48      5962
weighted avg       0.79      0.81      0.78      5962



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### n_neighbors=5, algorithm='KD_Tree'

In [92]:
n_neighbors = 5
knnclf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance', algorithm='kd_tree')
knnclf.fit(X_train, y_train)
y_test_pred = knnclf.predict(X_test)

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.53      0.44      0.48        18
           2       0.81      0.42      0.55       226
           3       0.67      0.34      0.45      1340
           4       0.83      0.97      0.90      4376
           5       0.00      0.00      0.00         2

    accuracy                           0.81      5962
   macro avg       0.57      0.44      0.48      5962
weighted avg       0.79      0.81      0.78      5962



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
