# Amazon Review Dataset

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [13]:
dataset = pd.read_csv('amazon_cells_labelled.txt',delimiter='\t',quoting=3)

In [14]:
dataset

Unnamed: 0,Reviews,Label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


## Creating the bag of words model

In [18]:
import re
import nltk

In [20]:
#nltk.download('stopwords') #  Uncomment this line if stopwords is not downloaded
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [22]:
corpus = []
for i in range(dataset.shape[0]):
    review = re.sub('[^A-z]', ' ' ,dataset['Reviews'][i]).lower().split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset['Label']

## Train test Split

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Classification using Gaussian NB model

In [27]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB(priors=None)

In [28]:
y_pred = classifier.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.72
[[60 36]
 [20 84]]


## Classification using Decision Tree model

In [149]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(max_depth=20,min_samples_split=0.5)
classifier.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=0.5,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [150]:
y_pred = classifier.predict(X_test)

In [151]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.785
[[90  6]
 [37 67]]


## Classification using Random Forest model

In [200]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=22)
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=22, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [201]:
y_pred = classifier.predict(X_test)

In [202]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.83
[[86 10]
 [24 80]]


## Classification using Gradient Boosting model

In [321]:
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier(learning_rate=0.675,n_estimators=140,max_depth=10)
classifier.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.675, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=140,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [322]:
y_pred = classifier.predict(X_test)

In [323]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.83
[[89  7]
 [27 77]]
