# Testing Literature Survey 2 - E Commerce

In this notebook, we run our project using a dataset of Flipkart Product Reviews acquired through Kaggle. We found this dataset to be very similar to the one used in our literature survey. By testing our models, we can draw a comparison between our performance and that of Naive Bayes reported in the research papers.

## Loading Dataset and Libraries ##

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import string
import os

import spacy 
from spacy.lang.en.stop_words import STOP_WORDS
import seaborn as sns
import matplotlib.pyplot as plt
import operator
import folium
from itertools import cycle, islice
from pandas import options
import warnings
import pickle
import nltk
from matplotlib.pyplot import figure
from nltk.corpus import stopwords 
import nltk


nltk.download('wordnet')
nltk.download('stopwords')
from tqdm import tqdm,tqdm_notebook

from  wordcloud import WordCloud
%matplotlib inline

[nltk_data] Downloading package wordnet to C:\Users\Abhinav
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Abhinav
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# DATA_SET = './datasets/aclimdb.csv'
df =pd.read_csv('ECommerceDataset.csv',encoding='latin1', low_memory=False)

In [3]:
df.review = df.review.apply(lambda x: x if isinstance(x, str)==True else np.nan)

In [4]:
df.sentiment = pd.to_numeric(df.sentiment, errors='coerce')

In [5]:
df.sentiment = df.sentiment.apply(lambda x: 1 if x>=3 else 0)

In [6]:
df.dropna(subset=['sentiment'], inplace=True)

In [7]:
df.dropna(subset=['review'], inplace=True)

In [8]:
df.drop(df[df.index > 150000].index, inplace = True)

## Preprocessing Data ##

In [9]:
#lowercasing all the words in the review
df['review']=df['review'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,review,sentiment
0,it's really worth every single penny. it works...,1
1,i bought crompton ozone 75 desert air cooler i...,1
2,great packaging by seller. as this was the mos...,1
3,delivery was delayed by two days except this e...,1
4,a good cooler by crompton. the height of the c...,1


In [10]:
#contraction to expansion : 
#converting the words in their contracted form to their extracted form eg. he'll to he will
#using the cont_to_exp() and a dictionary:{key: contractions,value:expansion}
contractions = { 
"ain't": "am not","aren't": "are not","can't": "cannot","can't've": "cannot have","'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have",
"didn't": "did not","doesn't": "does not","don't": "do not",
"hadn't": "had not","hadn't've": "had not have","hasn't": "has not","haven't": "have not",
"he'd": "he would","he'd've": "he would have","he'll": "he will","he'll've": "he will have","he's": "he is","how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how does",
"i'd": "i would","i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have","isn't": "is not",
"it'd": "it would","it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is","let's": "let us",
"ma'am": "madam","mayn't": "may not",
"might've": "might have","mightn't": "might not","mightn't've": "might not have",
"must've": "must have","mustn't": "must not","mustn't've": "must not have",
"needn't": "need not","needn't've": "need not have",
"o'clock": "of the clock","oughtn't": "ought not","oughtn't've": "ought not have",
"shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have",
"she'd": "she would","she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
"should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
"so've": "so have","so's": "so is",
"that'd": "that would","that'd've": "that would have","that's": "that is",
"there'd": "there would","there'd've": "there would have","there's": "there is",
"they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have",
"to've": "to have","wasn't": "was not",
" u ": " you "," ur ": " your "," n ": " and ",
"ain't": "is not", "aren't": "are not","can't": "cannot", "cause": "because", "could've": "could have", "couldn't": "could not",
"didn't": "did not",  "doesn't": "does not", "don't": "do not", 
"hadn't": "had not", "hasn't": "has not", "haven't": "have not",
"he'd": "he would","he'll": "he will", "he's": "he is", 
"how'd": "how did", "how'd'y": "how do you", 
"how'll": "how will", "how's": "how is",
"I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
"i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
"it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
"mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
"mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
"oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
"she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
"should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
"this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
"there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
"they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
"wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
"what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
"where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
"why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
"would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
"y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
"you're": "you are", "you've": "you have"}
def cont_to_exp(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key,value)
        return x
    else : 
        return x
df['review'] = df['review'].apply(lambda x:cont_to_exp(x))


In [11]:
# Removing the urls from the reviews
df['review']=df['review'].apply(lambda x: re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','',x)) 

In [12]:
#Stopword removal : Stopwords are the words that appear quite frequently in a sentence and do not have a significant contribution to the meaning of the sentence. Therefore they can be removed.
df['review'] = df['review'].apply(lambda x:" ".join([t for t in x.split() if t not in STOP_WORDS ]))


In [13]:
# Removal of special characters from the reviews
df['review']=df['review'].apply(lambda x:re.sub(r'[^0-9a-zA-Z *]','',x))
df['review']=df['review'].apply(lambda x:re.sub(r'[^a-zA-z0-9\s]','',x))

df['review'] = df['review'].apply(lambda x:''.join([i for i in x if i not in string.punctuation]))


In [14]:
#Removal of mulitple spaces between the words in the review
df["review"]=df["review"].apply(lambda x: " ".join(x.split()))


In [15]:
#Removal of HTML Tags: from the reviews
df['review'] = df['review'].apply(lambda x:BeautifulSoup(x,'lxml').get_text())

#Remove tags and links 
tag = re.compile(r'<[^>]+>')

df['review'] = df['review'].apply(lambda x: tag.sub('', x)) #removing html labels

df['review'] = df['review'].replace(r'http\S+', '', regex=True).replace(r'www.\S+', '', regex=True).replace(r'http\S+', '', regex=True).replace(r'"', '', regex=True)

df

Unnamed: 0,review,sentiment
0,worth single penny works like ton ac provided ...,1
1,bought crompton ozone 75 desert air cooler mon...,1
2,great packaging seller important point transpo...,1
3,delivery delayed days finesafely packedair flo...,1
4,good cooler crompton height cooler 3ft 10 inch...,1
...,...,...
149996,good product,1
149997,awsm,1
149998,nice product,1
149999,beautiful,1


In [16]:
# Removal of Numbers
df['review']=df['review'].apply(lambda x:re.sub(r'[0-9]+','',x))

In [17]:
# Removal of usernames from the reviews
df['review']=df['review'].apply(lambda x:re.sub(r'@[A-Za-z0–9]+','',x))
df.sample(10)

Unnamed: 0,review,sentiment
41842,bad quality,0
130252,nice,1
27676,sound quality fair loud presence noises bass p...,1
49977,superb,1
132128,nice product,1
111091,buy,0
32637,excellent voice,1
23627,nice good,1
69876,nice product,1
51428,goodbat enjoycorona holiday play matches light...,1


In [18]:
# Tokenization and Lemmanization

nltk.download('omw-1.4')

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(w, pos="v") for w in w_tokenizer.tokenize(text)])

df['review'] = df.review.apply(lemmatize_text).copy()

df

[nltk_data] Downloading package omw-1.4 to C:\Users\Abhinav
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,review,sentiment
0,worth single penny work like ton ac provide ro...,1
1,buy crompton ozone desert air cooler month sum...,1
2,great package seller important point transport...,1
3,delivery delay days finesafely packedair flow ...,1
4,good cooler crompton height cooler ft inch col...,1
...,...,...
149996,good product,1
149997,awsm,1
149998,nice product,1
149999,beautiful,1


## Splitting Data ##

In [19]:
# Splitting Dataset into training and testing sets
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score

x = pd.DataFrame(df, columns = ['review']) 
y = pd.DataFrame(df, columns = ['sentiment']) 

# Split dataset to train and test set.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

print("Shape of x_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("Shape of x_test:  ", X_test.shape)
print("Shape of y_test:  ", y_test.shape)

Shape of x_train:  (110998, 1)
Shape of y_train:  (110998, 1)
Shape of x_test:   (37000, 1)
Shape of y_test:   (37000, 1)


## Vectorization with TF-IDF ##

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

train = X_train['review'].tolist()
test = X_test['review'].tolist()
STOPWORDS = set(stopwords.words('english'))

tfidf_vectorizer = TfidfVectorizer(max_features=1000, dtype=np.float32)

tfidfX_train = tfidf_vectorizer.fit_transform(train)
tfidfX_train = tfidfX_train.toarray()

tfidfX_test = tfidf_vectorizer.transform(test)
tfidfX_test = tfidfX_test.toarray()

print("TF-IDF train shape:", tfidfX_train.shape)
print("TF-IDF test shape:", tfidfX_test.shape)

TF-IDF train shape: (110998, 1000)
TF-IDF test shape: (37000, 1000)


## Decision Tree ##

In [21]:
from sklearn.tree import DecisionTreeClassifier

dct = DecisionTreeClassifier(criterion='entropy', random_state=1)
dct.fit(tfidfX_train,y_train)

DecisionTreeClassifier(criterion='entropy', random_state=1)

In [22]:
# Making Predictions
y_pred_dct = dct.predict(tfidfX_test)

# Evaluation Metrics

dct_accuracy = accuracy_score(y_test,y_pred_dct)*100
dct_matrix = confusion_matrix(y_test,y_pred_dct)
dct_precision = dct_matrix[0][0]*100/(dct_matrix[0][0]+dct_matrix[1][0])
dct_recall = dct_matrix[0][0]*100/(dct_matrix[0][0]+dct_matrix[0][1])

print("Accuracy : ",dct_accuracy)
print("Confusion_matrix:\n",dct_matrix)
# print("precision:",dct_precision)
# print("recall:",dct_recall)

Accuracy :  92.34324324324325
Confusion_matrix:
 [[ 3781  1632]
 [ 1201 30386]]


## XG Boosting ##

In [23]:
from xgboost import XGBClassifier

xg = XGBClassifier(random_state=22,learning_rate=0.9)
xg.fit(tfidfX_train,y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.9, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=22, ...)

In [24]:
# Making Predictions
y_pred_xg = xg.predict(tfidfX_test)

# Evaluation Metrics

xg_accuracy = accuracy_score(y_test,y_pred_xg)*100
xg_matrix = confusion_matrix(y_test,y_pred_xg)
xg_precision = xg_matrix[0][0]*100/(xg_matrix[0][0]+xg_matrix[1][0])
xg_recall = xg_matrix[0][0]*100/(xg_matrix[0][0]+xg_matrix[0][1])

print("Accuracy : ",xg_accuracy)
print("Confusion_matrix:\n",xg_matrix)
# print("precision:",xg_precision)
# print("recall:",xg_recall)

Accuracy :  93.18108108108109
Confusion_matrix:
 [[ 3684  1729]
 [  794 30793]]


## Random Forest ##

In [25]:
from sklearn.ensemble import RandomForestClassifier

rf1 = RandomForestClassifier(n_estimators=150,max_depth=None)
rf1.fit(tfidfX_train,y_train)

  rf1.fit(tfidfX_train,y_train)


RandomForestClassifier(n_estimators=150)

In [26]:
# Making Predictions
y_pred_rf1 = rf1.predict(tfidfX_test)

# Evaluation Metrics

rf1_accuracy = accuracy_score(y_test,y_pred_rf1)*100
rf1_matrix = confusion_matrix(y_test,y_pred_rf1)
rf1_precision = rf1_matrix[0][0]*100/(rf1_matrix[0][0]+rf1_matrix[1][0])
rf1_recall = rf1_matrix[0][0]*100/(rf1_matrix[0][0]+rf1_matrix[0][1])

print("Accuracy : ",rf1_accuracy)
print("Confusion_matrix:\n",rf1_matrix)
# print("precision:",rf1_precision)
# print("recall:",rf1_recall)

Accuracy :  93.48378378378378
Confusion_matrix:
 [[ 3729  1684]
 [  727 30860]]


## Logistic Regression ##

In [27]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0,solver='lbfgs')

lr.fit(tfidfX_train,y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [28]:
# Making Predictions
y_pred_lr = lr.predict(tfidfX_test)

# Evaluation Metrics

lr_accuracy = accuracy_score(y_test,y_pred_lr)*100
lr_matrix = confusion_matrix(y_test,y_pred_lr)
lr_precision = lr_matrix[0][0]*100/(lr_matrix[0][0]+lr_matrix[1][0])
lr_recall = lr_matrix[0][0]*100/(lr_matrix[0][0]+lr_matrix[0][1])

print("Accuracy : ",lr_accuracy)
print("Confusion_matrix:\n",lr_matrix)
# print("precision:",lr_precision)
# print("recall:",lr_recall)

Accuracy :  92.61351351351351
Confusion_matrix:
 [[ 3383  2030]
 [  703 30884]]


## Extra Tree Classifier ##

In [29]:
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(random_state=123)
etc.fit(tfidfX_train,y_train)

  etc.fit(tfidfX_train,y_train)


ExtraTreesClassifier(random_state=123)

In [30]:
# Making Predictions
y_pred_etc = etc.predict(tfidfX_test)

# Evaluation Metrics

etc_accuracy = accuracy_score(y_test,y_pred_etc)*100
etc_matrix = confusion_matrix(y_test,y_pred_etc)
etc_precision = etc_matrix[0][0]*100/(etc_matrix[0][0]+etc_matrix[1][0])
etc_recall = etc_matrix[0][0]*100/(etc_matrix[0][0]+etc_matrix[0][1])

print("Accuracy : ",etc_accuracy)
print("Confusion_matrix:\n",etc_matrix)
# print("precision:",etc_precision)
# print("recall:",etc_recall)

Accuracy :  93.3891891891892
Confusion_matrix:
 [[ 3739  1674]
 [  772 30815]]


## Voting ##

In [31]:
# Defining Estimators

estimators = [('dct',dct),('xg',xg),('rf1',rf1),('lr',lr),('etc',etc)]
estimators

[('dct', DecisionTreeClassifier(criterion='entropy', random_state=1)),
 ('xg',
  XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=0.9, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=None, num_parallel_tree=None,
                predictor=None, random_state=22, ...)),
 ('rf1', RandomForestClassifier(n_estimators=150)),
 ('lr', LogisticRegression(random_state=0)),
 ('etc', ExtraTreesClassifier(random_state=123))]

In [32]:
# Evaluating the Estimator

from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(estimators, weights=[0.5,1,2.5,1,2.5])
vc.fit(tfidfX_train,y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier(estimators=[('dct',
                              DecisionTreeClassifier(criterion='entropy',
                                                     random_state=1)),
                             ('xg',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, gamma=None,
                                            gpu_id=None, grow_policy=...
                                            max_delta_step=None, max_depth=None,
           

In [33]:
# Making Predictions
y_pred_vc = vc.predict(tfidfX_test)

# Evaluation Metrics

vc_accuracy = accuracy_score(y_test,y_pred_vc)*100
vc_matrix = confusion_matrix(y_test,y_pred_vc)
vc_precision = vc_matrix[0][0]*100/(vc_matrix[0][0]+vc_matrix[1][0])
vc_recall = vc_matrix[0][0]*100/(vc_matrix[0][0]+vc_matrix[0][1])

print("Accuracy : ",vc_accuracy)
print("Confusion_matrix:\n",vc_matrix)
# print("precision:",vc_precision)
# print("recall:",vc_recall)

Accuracy :  93.53783783783783
Confusion_matrix:
 [[ 3741  1672]
 [  719 30868]]


In [35]:
print("Testing Accuracies")
acc_list = {
    'Decision Tree':dct_accuracy,
    'XG':xg_accuracy,
    'Random Forest':rf1_accuracy,
    'Logistic Regression':lr_accuracy,
    'Extra Tree Classifier':etc_accuracy,
    'Voting Classifier':vc_accuracy
}
acc_df_test = pd.DataFrame.from_dict(acc_list,orient="index",columns=['Accuracy'])
acc_df_test

Testing Accuracies


Unnamed: 0,Accuracy
Decision Tree,92.343243
XG,93.181081
Random Forest,93.483784
Logistic Regression,92.613514
Extra Tree Classifier,93.389189
Voting Classifier,93.537838
