In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# <center><u>IMPORTS</u></center>

In [2]:
import json
import string
import xgboost
import numpy as np
import pandas as pd
from sklearn import svm
from keras import layers
from sklearn import metrics
from sklearn import ensemble
from keras import optimizers
from sklearn import naive_bayes
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import decomposition
from keras.models import Sequential
from sklearn import model_selection 
from keras.preprocessing import text
from keras.preprocessing import sequence
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


# <center><u>DATA LOADING</u></center>

In [3]:
input_file = "Clothing_Shoes_and_Jewelry_5.json"

In [4]:
input_json = open(input_file, "r", encoding="utf-8" )

In [5]:
import csv
output_file = "reviews_Clothing_Shoes_and_Jewelry_5.csv"
with open(output_file, "w", encoding="utf-8") as output_csv:
    csv_writer = csv.writer(output_csv)
    flag = 0
    for line in input_json.readlines():
        dic = json.loads(line)
        # writing headline in the beginning
        if flag == 0:
            csv_writer.writerow(dic)
            flag = 1
        csv_writer.writerow(dic.values())

In [6]:
input_data = pd.read_csv("reviews_Clothing_Shoes_and_Jewelry_5.csv")

In [7]:
input_data.keys()

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [8]:
input_data.head(2).T

Unnamed: 0,0,1
reviewerID,A1KLRMWW2FWPL4,A2G5TCU2WDFZ65
asin,0000031887,0000031887
reviewerName,"Amazon Customer ""cameramom""",Amazon Customer
helpful,"[0, 0]","[0, 0]"
reviewText,This is a great tutu and at a really great pri...,I bought this for my 4 yr old daughter for dan...
overall,5.0,5.0
summary,Great tutu- not cheaply made,Very Cute!!
unixReviewTime,1297468800,1358553600
reviewTime,"02 12, 2011","01 19, 2013"


In [9]:
use_df = input_data[['reviewText','overall']]

In [10]:
use_df.head(2)

Unnamed: 0,reviewText,overall
0,This is a great tutu and at a really great pri...,5.0
1,I bought this for my 4 yr old daughter for dan...,5.0


# <center><u>DATA PREPROCESSING</u></center>

In [11]:
use_df = use_df.dropna()

In [12]:
use_df.isna().sum()

reviewText    0
overall       0
dtype: int64

In [13]:
len(use_df)

278653

In [14]:
use_df.dtypes

reviewText    object
overall       object
dtype: object

In [15]:
use_df["overall"].value_counts()[:10]

5.0            162939
4.0             58270
3.0             30385
2.0             15439
1.0             11168
Five Stars         29
Four Stars          8
One Star            5
Three Stars         3
Beautiful!          3
Name: overall, dtype: int64

In [16]:
use_df["overall"][use_df["overall"] == "Five Stars"] = "5.0"
use_df["overall"][use_df["overall"] == "Four Stars"] = "4.0"
use_df["overall"][use_df["overall"] == "Three Stars"] = "3.0"
use_df["overall"][use_df["overall"] == "Two Stars"] = "2.0"
use_df["overall"][use_df["overall"] == "One Star"] = "1.0"

In [17]:
use_df["overall"].value_counts()[:10]

5.0           162968
4.0            58278
3.0            30388
2.0            15440
1.0            11173
Beautiful!         3
Nice               3
Very happy         2
gift               2
Too big            2
Name: overall, dtype: int64

In [18]:
use_df = use_df[use_df["overall"].isin(["5.0","4.0","3.0","2.0","1.0"])]

In [19]:
use_df["overall"].value_counts()[:10]

5.0    162968
4.0     58278
3.0     30388
2.0     15440
1.0     11173
Name: overall, dtype: int64

In [20]:
len(use_df)

278247

In [21]:
use_df["overall"] = use_df["overall"].apply(pd.to_numeric)

In [22]:
use_df.dtypes

reviewText     object
overall       float64
dtype: object

In [23]:
len(use_df)

278247

In [24]:
use_df = use_df[use_df["overall"] != 3]

In [25]:
len(use_df)

247859

In [26]:
use_df["label"] = use_df["overall"].apply(lambda rating : +1 if rating > 3 else -1)

In [27]:
use_df["label"].unique()

array([ 1, -1])

In [28]:
len(use_df)

247859

In [29]:
use_df.head(2)

Unnamed: 0,reviewText,overall,label
0,This is a great tutu and at a really great pri...,5.0,1
1,I bought this for my 4 yr old daughter for dan...,5.0,1


# <center><u>DATA MODELLING</u></center>

### Logistic Regression

In [30]:
X = pd.DataFrame(use_df, columns = ["reviewText"])
y = pd.DataFrame(use_df, columns = ["label"])

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=50)

In [31]:
train_y[:5]

Unnamed: 0,label
162297,-1
249034,-1
77299,1
214257,1
90418,1


In [32]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_vector = vectorizer.fit_transform(train_X["reviewText"])
test_vector = vectorizer.transform(test_X["reviewText"])

In [33]:
type(train_vector)

scipy.sparse.csr.csr_matrix

In [34]:
%%time
clr = LogisticRegression()
clr.fit(train_vector, train_y.values.ravel())
scores = clr.score(test_vector, test_y) # accuracy
print(scores)

0.9443072702331962
CPU times: user 1min 4s, sys: 92 ms, total: 1min 4s
Wall time: 1min 4s


In [35]:
clr.predict(test_vector)[:10]

array([ 1,  1,  1,  1, -1,  1, -1,  1,  1,  1])

In [36]:
test_y.head(10).T

Unnamed: 0,227701,108133,36082,215135,123532,140478,9449,155488,277462,183549
label,1,1,1,1,-1,1,-1,1,1,1


In [37]:
test_X.head()

Unnamed: 0,reviewText
227701,It is exactly what I wanted exactly what they ...
108133,I literally just took this bra out of the pack...
36082,"Mine came missing one side of the top button, ..."
215135,I LOVE this coat and receive compliments all t...
123532,You get what you pay for. I already own a pair...


In [38]:
input_text = test_X.reset_index().iloc[0]

In [39]:
input_text = input_text["reviewText"]

In [40]:
input_text

'It is exactly what I wanted exactly what they described. Looks lovely on and fits wonderful. I love this shirt.'

In [41]:
output_val = test_y.reset_index().iloc[0]

In [42]:
output_val = output_val["label"]

In [43]:
output_val

1

In [44]:
input_vector = vectorizer.transform(test_X.head())

In [45]:
print(input_vector)




In [46]:
len(clr.predict(input_vector))

1

### Keras

### The following keras model has been trained in crestle.ai cloud using GPU

In [47]:
X = pd.DataFrame(use_df, columns = ["reviewText"])
y = pd.DataFrame(use_df, columns = ["label"])

train_X, test_X, train_y, test_y = train_test_split(X, y,test_size = 0.15,random_state=50)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size = 0.15,random_state=50)

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_vector = vectorizer.fit_transform(train_X["reviewText"])
val_vector = vectorizer.transform(val_X["reviewText"])
test_vector = vectorizer.transform(test_X["reviewText"])

input_dim = train_vector.shape[1]  # Number of features
model = Sequential()
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [48]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [49]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10)                631680    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 631,691
Trainable params: 631,691
Non-trainable params: 0
_________________________________________________________________


In [51]:
history = model.fit(train_vector, train_y,epochs=10,verbose=True,validation_data=(val_vector, val_y),batch_size=1024)

Train on 179078 samples, validate on 31602 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [52]:
loss, accuracy = model.evaluate(train_vector, train_y, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(test_vector, test_y, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.8313
Testing Accuracy:  0.8207
