In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn.utils import resample
from string import punctuation
import re
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [2]:
reviews_raw = pd.read_json('reviews_Sports_And_Outdoors_5.JSON', lines=True)
pd.DataFrame.from_dict(reviews_raw, orient='columns')
reviews_raw.head(10)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,1881509818,"[0, 0]",5,This came in on time and I am veru happy with ...,"01 26, 2014",AIXZKN4ACSKI,David Briner,Woks very good,1390694400
1,1881509818,"[1, 1]",5,I had a factory Glock tool that I was using fo...,"02 2, 2012",A1L5P841VIO02V,Jason A. Kramer,Works as well as the factory tool,1328140800
2,1881509818,"[2, 2]",4,If you don't have a 3/32 punch or would like t...,"02 28, 2012",AB2W04NI4OEAD,J. Fernald,"It's a punch, that's all.",1330387200
3,1881509818,"[0, 0]",4,This works no better than any 3/32 punch you w...,"02 5, 2012",A148SVSWKTJKU6,"Jusitn A. Watts ""Maverick9614""",It's a punch with a Glock logo.,1328400000
4,1881509818,"[0, 0]",4,I purchased this thinking maybe I need a speci...,"04 23, 2013",AAAWJ6LW9WMOO,Material Man,"Ok,tool does what a regular punch does.",1366675200
5,1881509818,"[0, 0]",5,"Needed this tool to really break down my G22, ...","11 2, 2012",A2XX2A4OJCDNLZ,RatherLiveInKeyWest,Glock punch tool - needed for your Glock and o...,1351814400
6,1881509818,"[0, 0]",5,If u don't have it .. Get it. All you need to ...,"06 10, 2014",A283UOBQRUNM4Q,Thomas Dragon,Great tool,1402358400
7,2094869245,"[0, 0]",4,This light will no doubt capture the attention...,"08 31, 2013",AWG3H90WVZ0Z1,Alec Nelson,Bright!,1377907200
8,2094869245,"[0, 1]",5,"Light and laser torch work well, very bright. ...","05 27, 2013",A3V52OTJHKIJZX,"A. Saenz Jr. ""Bettering self""",Be seen,1369612800
9,2094869245,"[0, 0]",5,Does everything it says it will do. I would li...,"11 2, 2013",A3SZBE5F3UQ9EC,"ChasRat ""ChasRat""",Bicycle rear tail light,1383350400


In [3]:
null_count = reviews_raw.isnull().sum()
null_count[null_count>0]

reviewerName    1402
dtype: int64

In [4]:
df2 = reviews_raw.drop(['reviewerID', 'reviewerName', 'asin', 'helpful', 'reviewTime', 'summary', 
                       'unixReviewTime'], 1)
threshold = 4
df2['Sentiment'] = np.where(df2['overall'] >= threshold, 1,0)
df2 = df2.drop(['overall'], 1)

df2.head(10)

Unnamed: 0,reviewText,Sentiment
0,This came in on time and I am veru happy with ...,1
1,I had a factory Glock tool that I was using fo...,1
2,If you don't have a 3/32 punch or would like t...,1
3,This works no better than any 3/32 punch you w...,1
4,I purchased this thinking maybe I need a speci...,1
5,"Needed this tool to really break down my G22, ...",1
6,If u don't have it .. Get it. All you need to ...,1
7,This light will no doubt capture the attention...,1
8,"Light and laser torch work well, very bright. ...",1
9,Does everything it says it will do. I would li...,1


In [5]:
df2['Sentiment'].value_counts()

1    253017
0     43320
Name: Sentiment, dtype: int64

In [6]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

for review in df2['reviewText']:
    review = text_cleaner(review)

df2.head(10)

Unnamed: 0,reviewText,Sentiment
0,This came in on time and I am veru happy with ...,1
1,I had a factory Glock tool that I was using fo...,1
2,If you don't have a 3/32 punch or would like t...,1
3,This works no better than any 3/32 punch you w...,1
4,I purchased this thinking maybe I need a speci...,1
5,"Needed this tool to really break down my G22, ...",1
6,If u don't have it .. Get it. All you need to ...,1
7,This light will no doubt capture the attention...,1
8,"Light and laser torch work well, very bright. ...",1
9,Does everything it says it will do. I would li...,1


In [7]:
positive = df2.loc[df2['Sentiment'] == 1]
negative = df2.loc[df2['Sentiment'] == 0]
df3 = resample(positive, n_samples=43320)
negative = resample(negative, n_samples=43320)
df3 = df3.append(negative)
df3['Sentiment'].value_counts()

1    43320
0    43320
Name: Sentiment, dtype: int64

In [8]:
X = df3['reviewText']
y = df3['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
vect2 = TfidfVectorizer(min_df=20, analyzer = 'word', stop_words = 'english', ngram_range = (1,2) ).fit(X_train)

X_train_vectorized = vect2.transform(X_train)
X_test_vectorized = vect2.transform(X_test)


In [9]:
def testing_func(model):
    
    model.fit(X_train_vectorized, y_train)
    model.fit(X_test_vectorized, y_test)

    # Inspect the results.
    print('\nR-squared for the Training Set:')
    print(model.score(X_train_vectorized, y_train))
    print('\nR-squared for the Test Set with PCA:')
    print(model.score(X_test_vectorized, y_test))
    print('Cross Validation Scores with 10 folds for PCA: {}'.format(cross_val_score(model, X_train_vectorized, y_train, cv=10)))

In [10]:
start = datetime.now()
#Let's put our variables through Naive Bayes.
bnb = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

testing_func(bnb)
print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the Training Set:
0.735684261301

R-squared for the Test Set with PCA:
0.778609401231
Cross Validation Scores with 10 folds for PCA: [ 0.72868217  0.73970715  0.73936262  0.74074074  0.75021533  0.73488372
  0.75159345  0.73936262  0.73608958  0.73255213]
It took the following time to complete this task: 0:00:01.821072


In [11]:
start = datetime.now()
#Let's pop those into the model
lr = LogisticRegression(C=1, penalty='l1')
testing_func(lr)

print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the Training Set:
0.789363974642

R-squared for the Test Set with PCA:
0.824986010073
Cross Validation Scores with 10 folds for PCA: [ 0.80861326  0.79534884  0.81171404  0.80551249  0.80999139  0.81343669
  0.8124031   0.79310939  0.80017227  0.79958642]
It took the following time to complete this task: 0:00:19.646285


In [12]:
start = datetime.now()
#Let's pop those into the model
lr = LogisticRegression(C=1, penalty='l2')
testing_func(lr)

print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the Training Set:
0.80092337376

R-squared for the Test Set with PCA:
0.860695299384
Cross Validation Scores with 10 folds for PCA: [ 0.82411714  0.80620155  0.82222222  0.81567614  0.81929371  0.81533161
  0.82239449  0.8089578   0.81360896  0.80940893]
It took the following time to complete this task: 0:00:16.423998


In [13]:
start = datetime.now()
clf = DecisionTreeClassifier(max_depth=8, random_state=0)
testing_func(clf)

print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the Training Set:
0.648342750827

R-squared for the Test Set with PCA:
0.660534415221
Cross Validation Scores with 10 folds for PCA: [ 0.65185185  0.63927649  0.64857881  0.64788975  0.65770887  0.65099053
  0.65822567  0.64633936  0.64651163  0.63846286]
It took the following time to complete this task: 0:01:18.690858


In [14]:
start = datetime.now()
#Now let's model with Gradient Boosting
clf = GradientBoostingClassifier(loss='deviance', learning_rate=1.0, n_estimators=10, subsample=1.0, 
                                          criterion='friedman_mse', min_samples_split=4, min_samples_leaf=1, 
                                          min_weight_fraction_leaf=0.0, max_depth=2, min_impurity_decrease=0.0, 
                                          min_impurity_split=None, init=None, random_state=None, max_features=None, 
                                          verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')

testing_func(clf)
print('It took the following time to complete this task:', datetime.now() - start)


R-squared for the Training Set:
0.671392640573

R-squared for the Test Set with PCA:
0.678616396195
Cross Validation Scores with 10 folds for PCA: [ 0.66287683  0.65219638  0.66046512  0.65925926  0.67459087  0.66907838
  0.66925065  0.66080965  0.66356589  0.64759607]
It took the following time to complete this task: 0:02:46.697318


Althought Bernoulli Bayes works fairly well, Ridge and Lasso Regression performed the best of all the models with around 0.80 Accuracy with similar scores on the Cross Validation/Test meaning that it likely isn't overfitting the data.