# Amazon Sentiment Analysis

In [91]:
import pandas as pd
import gzip
import string
import numpy as np

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF('reviews_Automotive_5.json.gz')

In [3]:
print(df.head())
print(df.shape)
print(df.columns)

       reviewerID        asin     reviewerName   helpful  \
0  A3F73SC1LY51OO  B00002243X  Alan Montgomery    [4, 4]   
1  A20S66SKYXULG2  B00002243X         alphonse    [1, 1]   
2  A2I8LFSN2IS5EO  B00002243X            Chris    [0, 0]   
3  A3GT2EWQSO45ZG  B00002243X           DeusEx  [19, 19]   
4  A3ESWJPAVRPWB4  B00002243X     E. Hernandez    [0, 0]   

                                          reviewText  overall  \
0  I needed a set of jumper cables for my new car...      5.0   
1  These long cables work fine for my truck, but ...      4.0   
2  Can't comment much on these since they have no...      5.0   
3  I absolutley love Amazon!!!  For the price of ...      5.0   
4  I purchased the 12' feet long cable set and th...      5.0   

                                      summary  unixReviewTime   reviewTime  
0  Work Well - Should Have Bought Longer Ones      1313539200  08 17, 2011  
1                            Okay long cables      1315094400   09 4, 2011  
2                

In order to set up a sentiment analysis the first step we need to do is cleaning the text to perform a better analysis on it.  Let's try to write a function to perform some of the initial scrubbing.

In [4]:
def clean_text(series):
    '''Cleaning text for sentiment analysis'''
    
    clean = series
    
    # Make everything lowercase
    clean = clean.apply(lambda x: x.lower())
    
    # Remove punctuation - the None says that we did not add any information to 'translate' (replace)
    # the string.punctuation provides a list of all the possible punctuation we want to remove
    clean = clean.apply(lambda x: x.translate(str.maketrans('','', string.punctuation)))
    
    # Remove numbers
    clean = clean.apply(lambda x: x.translate(str.maketrans('','', string.digits)))
    
    # Remove a few common stop words
    stop_words = ['a','able','about','across','after','all','almost','also','am','among','an','and','any','are',
                  'as','at','be','because','been','but','by','can','cannot','could','dear','did','do','does','either',
                  'else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how',
                  'however','i','im','if','in','into','is','it','its','ive','just','least','let','like','likely','may','me',
                  'might','most','must','my','neither','no','nor','not','of','off','often','on','only','or','other','our','own',
                  'rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then',
                  'there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where',
                  'which','while','who','whom','why','will','with','would','yet','you','your']
    clean = clean.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    
    # use the imported stop words from sklearn
    stop = text.ENGLISH_STOP_WORDS
    clean = clean.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    
    return clean

In [5]:
df['reviewText'] = clean_text(df.reviewText)
df['summary'] = clean_text(df.summary)
print(df[['reviewText', 'summary']])

                                              reviewText  \
0      needed set jumper cables new car good reviews ...   
1      long cables work fine truck quality little sha...   
2      comment used come update review issues use bui...   
3      absolutley love amazon price set cheap booster...   
4      purchased feet long cable set arrived retail c...   
5      jumper cables heavy duty easy store containers...   
6      bought k suburban plenty length rear ended usi...   
7      good motorized vehicles running semi farm equi...   
8      coleman cable feet heavyduty truck auto batter...   
9      old car bound need set gave old ones boss truc...   
10     use jumper cables time year usually elses car ...   
11     jumper cables real jumper cables jumper cables...   
12     guys stops helps people need pro recovery guy ...   
13     arent best cables buy youll electrical welding...   
14     hard pure copper cabled jumper cables add gave...   
15     insurance policy land rover ounce

It looks like the summary data is much cleaner - so we will use that for the initial training of the model.  What the countvectorizer is doing is 'tokenizing the documents and count the occurrences of token and returning them as a sparse matrix'. Essentially, getting a frequency count for the words left in the dataset, returning a count matrix.

In [6]:
cv = CountVectorizer()
cv.fit(df.summary)
X = cv.transform(df.summary)
print(X.shape)

(20473, 6735)


In [22]:
fullwords = pd.DataFrame(list(zip(cv.get_feature_names(), sum(X.toarray()))), 
             columns=['Word','Frequency']).sort_values(by='Frequency', ascending=False)

In [7]:
# split up data
X_train, X_test, Y_train, Y_test = train_test_split(X, df.overall, test_size=0.3)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(14331, 6735)
(6142, 6735)
(14331,)
(6142,)


In [93]:
def baseline_model(clf, x, y, crossval=False, cv=5, cm=False):
    # Run baseline regression
    fit = clf.fit(x, y)

    # Predict the model
    pred_y = clf.predict(x)
    
    if crossval == True:
        # Average score
        avscore = cross_val_score(clf, x, y, cv=cv)
        print('Baseline Model has a average cv score of', round(avscore.mean(), 2) * 100,"% accuracy score")
    
    score = clf.score(x,y)
    print('Baseline Model has a score of: {}'.format(score))
    
    # Get the Precision, Recall, and F1 score
    print(classification_report(y, pred_y))
    
    if cm==True:
        print('Confusion Matrix:')
        print(confusion_matrix(y,pred_y))
    
    return fit

In [94]:
# Logistic Regression
lr = baseline_model(LogisticRegression(), X_train, Y_train, crossval=True)

Baseline Model has a average cv score of 70.0 % accuracy score
Baseline Model has a score of: 0.7547275137813132
              precision    recall  f1-score   support

         1.0       0.91      0.28      0.43       370
         2.0       0.87      0.19      0.32       409
         3.0       0.77      0.34      0.47      1000
         4.0       0.76      0.23      0.35      2771
         5.0       0.75      0.99      0.85      9781

   micro avg       0.75      0.75      0.75     14331
   macro avg       0.81      0.41      0.48     14331
weighted avg       0.76      0.75      0.70     14331



In [74]:
# Naive Bayes
nb = baseline_model(BernoulliNB(), X_train, Y_train, crossval=True)

Baseline Model has a average cv score of 68.0 % accuracy score
Baseline Model has a score of: 0.7191403251692136
              precision    recall  f1-score   support

         1.0       1.00      0.00      0.01       370
         2.0       1.00      0.01      0.01       409
         3.0       0.72      0.08      0.14      1000
         4.0       0.64      0.19      0.30      2771
         5.0       0.72      0.99      0.84      9781

   micro avg       0.72      0.72      0.72     14331
   macro avg       0.82      0.25      0.26     14331
weighted avg       0.72      0.72      0.64     14331



In [75]:
# Decision Tree
dt = baseline_model(DecisionTreeClassifier(max_features='sqrt', max_depth=10), X_train, Y_train, crossval=True)

Baseline Model has a average cv score of 68.0 % accuracy score
Baseline Model has a score of: 0.68571627939432
              precision    recall  f1-score   support

         1.0       1.00      0.01      0.02       370
         2.0       0.88      0.02      0.03       409
         3.0       0.62      0.01      0.03      1000
         4.0       0.78      0.01      0.02      2771
         5.0       0.69      1.00      0.81      9781

   micro avg       0.69      0.69      0.69     14331
   macro avg       0.79      0.21      0.18     14331
weighted avg       0.71      0.69      0.56     14331



In [76]:
rfc = baseline_model(RandomForestClassifier(max_features='sqrt', max_depth=50), X_train, Y_train, crossval=True)

Baseline Model has a average cv score of 69.0 % accuracy score
Baseline Model has a score of: 0.7155118275068034
              precision    recall  f1-score   support

         1.0       0.95      0.16      0.27       370
         2.0       1.00      0.12      0.22       409
         3.0       0.91      0.18      0.30      1000
         4.0       0.94      0.07      0.13      2771
         5.0       0.71      1.00      0.83      9781

   micro avg       0.72      0.72      0.72     14331
   macro avg       0.90      0.31      0.35     14331
weighted avg       0.78      0.72      0.62     14331



From the data above it appears that the best model to use going forwards is going the Random Forest Classifier.  It the best accuracy combined with the best recall scores.

In [77]:
# Test the random forest classifier
rfc.score(X_test, Y_test)

0.6846304135460762

Let's drop the features that have very low frequencies.  Figure out how to use n-grams and normalize words like loved or loving to love.

In [80]:
# by setting the max_features in the CountVectorizer we can select the most frequent words in the file
cv_max = CountVectorizer(max_features=1000, ngram_range=(1,2))
cv_max.fit(df.summary)
X_max = cv_max.transform(df.summary)
print(X_max.shape)
Xm_train, Xm_test, Ym_train, Ym_test = train_test_split(X_max, df.overall, test_size=0.3)


(20473, 1000)


In [81]:
topwords = pd.DataFrame(list(zip(cv_max.get_feature_names(), sum(X_max.toarray()))), 
             columns=['Word','Frequency']).sort_values(by='Frequency', ascending=False)
print(topwords.head())

        Word  Frequency
342    great       3294
317     good       2057
974    works       1663
656  product       1089
572     nice        742


In [82]:
classifiers=[LogisticRegression(), 
             BernoulliNB(),
             DecisionTreeClassifier(max_features='sqrt'),
             RandomForestClassifier(max_features='sqrt')]
for clf in classifiers:
    print(str(clf).strip('()'))
    baseline_model(clf, Xm_train, Ym_train, crossval=True)
    print('--'*50)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False
Baseline Model has a average cv score of 69.0 % accuracy score
Baseline Model has a score of: 0.7146744818924011
              precision    recall  f1-score   support

         1.0       0.81      0.18      0.29       377
         2.0       0.77      0.08      0.14       438
         3.0       0.67      0.20      0.30      1000
         4.0       0.57      0.15      0.24      2791
         5.0       0.72      0.98      0.83      9725

   micro avg       0.71      0.71      0.71     14331
   macro avg       0.71      0.32      0.36     14331
weighted avg       0.69      0.71      0.64     14331

----------------------------------------------------------------------------------------------------
BernoulliNB(alpha=1.0, binarize=0.

Let's go one step further and apply a Term Frequency - Inverse Document Frequency transform to the count matrix.  The term frequency part means that the frequencies will be divided by the total number of words in the set.  The inverse document frequency part is to downscale the weights of words that occur the most frequently in the matrix. The goal of using tf-idf instead of the raw frequencies (count matrix) of occurrence of a token in a given document is to scale down the impact of tokens that occur very frequently in a given corpus and that are hence empirically less informative than features that occur in a small fraction of the training corpus.

In [96]:
#tfidf transform
tfidf = TfidfTransformer(norm='l2')
transformedX = tfidf.fit_transform(X_max)

# resplit
Xt_train, Xt_test, Yt_train, Yt_test = train_test_split(X_max, df.overall, test_size=0.3)

In [98]:
for clf in classifiers:
    print(str(clf).strip('()'))
    baseline_model(clf, Xt_train, Yt_train, crossval=True, cm=True)
    print('--'*50)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False
Baseline Model has a average cv score of 69.0 % accuracy score
Baseline Model has a score of: 0.7191403251692136
              precision    recall  f1-score   support

         1.0       0.76      0.18      0.29       369
         2.0       0.66      0.08      0.14       450
         3.0       0.64      0.23      0.34      1009
         4.0       0.59      0.15      0.24      2763
         5.0       0.73      0.98      0.84      9740

   micro avg       0.72      0.72      0.72     14331
   macro avg       0.68      0.33      0.37     14331
weighted avg       0.69      0.72      0.65     14331

Confusion Matrix:
[[  67    6   17   14  265]
 [  12   35   33   27  343]
 [   4    6  237   95  667]
 [   3    4   54  423 2279]
 [   

Not much change using the Tfidf, ngrams (1,2), max_features=1000, and normalization (both l1 and l2).  Since we are creating a sentiment analysis, let's code the outcome variable as a binary to try and capture more of the sentiment.

In [87]:
print(df.overall.value_counts())

5.0    13928
4.0     3967
3.0     1430
2.0      606
1.0      542
Name: overall, dtype: int64


Since there are so many 5.0 ratings we will combine 4.0 and 5.0 as good, and the rest will be considered bad.  This binary coding assumes that 3s are bad, but with such an imbalance between goods and bads I feel this will be okay.

In [101]:
# make outcome a binary
binary = np.where(df.overall >= 4, 1, 0)

# resplit
Xtb_train, Xtb_test, Ytb_train, Ytb_test = train_test_split(X_max, binary, test_size=0.3)

# re-run models
for clf in classifiers:
    print(str(clf).strip('()'))
    baseline_model(clf, Xtb_train, Ytb_train, crossval=True, cm=True)
    print('--'*50)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False
Baseline Model has a average cv score of 89.0 % accuracy score
Baseline Model has a score of: 0.8972158258321122
              precision    recall  f1-score   support

           0       0.80      0.25      0.38      1809
           1       0.90      0.99      0.94     12522

   micro avg       0.90      0.90      0.90     14331
   macro avg       0.85      0.62      0.66     14331
weighted avg       0.89      0.90      0.87     14331

Confusion Matrix:
[[  451  1358]
 [  115 12407]]
----------------------------------------------------------------------------------------------------
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True
Baseline Model has a average cv score of 88.0 % accuracy score
Baseline Model

In [102]:
rfc = baseline_model(RandomForestClassifier(), Xtb_train, Ytb_train, crossval=True, cm=True)

Baseline Model has a average cv score of 87.0 % accuracy score
Baseline Model has a score of: 0.9457818714674482
              precision    recall  f1-score   support

           0       0.91      0.64      0.75      1809
           1       0.95      0.99      0.97     12522

   micro avg       0.95      0.95      0.95     14331
   macro avg       0.93      0.81      0.86     14331
weighted avg       0.94      0.95      0.94     14331

Confusion Matrix:
[[ 1151   658]
 [  119 12403]]


In [108]:
print(rfc.score(Xtb_test, Ytb_test))
table = pd.crosstab(Ytb_test, rfc.predict(Xtb_test), margins=True)
print(table)
print(classification_report(Ytb_test, rfc.predict(Xtb_test)))

0.8650276782806904
col_0    0     1   All
row_0                 
0      207   562   769
1      267  5106  5373
All    474  5668  6142
              precision    recall  f1-score   support

           0       0.44      0.27      0.33       769
           1       0.90      0.95      0.92      5373

   micro avg       0.87      0.87      0.87      6142
   macro avg       0.67      0.61      0.63      6142
weighted avg       0.84      0.87      0.85      6142



With the parameters set above we have a 87% accuracy in the test set.  We are very good at predicting good values but struggle more with the bads.