# Yelp Review Dataset

In [1]:
import numpy as np
import pandas as pd

## Importing the dataset

In [3]:
dataset = pd.read_csv('yelp_labelled.txt',delimiter='\t',quoting=3)

In [4]:
dataset

Unnamed: 0,Reviews,Label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


## Creating the bag of words model

In [5]:
import re
import nltk

In [6]:
#nltk.download('stopwords') #  Uncomment this line if stopwords is not downloaded
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [7]:
corpus = []
for i in range(dataset.shape[0]):
    review = re.sub('[^A-z]', ' ' ,dataset['Reviews'][i]).lower().split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset['Label']

## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Classification using Gaussian NB model

In [10]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB(priors=None)

In [11]:
y_pred = classifier.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.73
[[55 42]
 [12 91]]


## Classification using Decision Tree model

In [56]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(max_depth=17)
classifier.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=17,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [57]:
y_pred = classifier.predict(X_test)

In [58]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.72
[[93  4]
 [52 51]]


## Classification using Random Forest model

In [74]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100,)
classifier.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [75]:
y_pred = classifier.predict(X_test)

In [76]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.71
[[86 11]
 [47 56]]


## Classification using Gradient Boosting model

In [95]:
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier(learning_rate=0.8,n_estimators=)
classifier.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.8, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [96]:
y_pred = classifier.predict(X_test)

In [97]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.755
[[83 14]
 [35 68]]
