In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as literal text instead of being
# interpreted as field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [4]:
# The stop-word list is a separate NLTK data package; fetch it only when it is
# missing so this cell also runs on a fresh environment (Restart & Run All).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Loop-invariant helpers, built once instead of once per word / per review.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the stop-word filter also removes "not" (the preview below
# shows 'Crust is not good.' reduced to 'crust good'), so negation is lost
# before vectorisation. Left unchanged here to keep the reported results.
corpus = []
for raw_review in dataset['Review']:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stop words, stem the remaining tokens, and rebuild one string.
    review = ' '.join(
        ps.stem(word) for word in tokens if word not in english_stopwords
    )
    corpus.append(review)

In [5]:
# Preview the first three cleaned reviews.
corpus[0:3]

['wow love place', 'crust good', 'tasti textur nasti']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features, with the vocabulary capped at 1500 terms.
# NOTE(review): the vocabulary is fitted on the whole corpus before the
# train/test split in the next cell, so test-set words influence the features.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()  # convert the count matrix to a dense array
y = dataset.loc[:, 'Liked']

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
from sklearn.naive_bayes import GaussianNB
# Baseline model: Gaussian naive Bayes fitted on the training split.
# The bare fit(...) call is the last expression so the fitted estimator is
# displayed as the cell output.
clf = GaussianNB()
clf.fit(X=xtrain, y=ytrain)

GaussianNB(priors=None, var_smoothing=1e-09)

In [9]:
# Predict labels for the held-out rows with the baseline model.
ypred = clf.predict(xtest)

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix (rows: true labels, columns: predicted labels) followed by
# the overall accuracy of the baseline predictions.
cm = confusion_matrix(ytest, ypred)
print(cm, accuracy_score(ytest, ypred), sep='\n')

[[55 42]
 [12 91]]
0.73


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [12]:
# Candidate classifiers as [display name, unfitted estimator] pairs.
# random_state is pinned wherever the estimator accepts it.
models = [
    ['Gaussian Classifier', GaussianNB()],
    ['SVC', SVC(kernel='linear', random_state=0)],
    ['Kernel SVM', SVC(kernel='rbf', random_state=0)],
    [
        'Random Forest Classifier',
        RandomForestClassifier(n_estimators=20, random_state=0),
    ],
]

In [14]:
# Fit every candidate on the same training split and report its accuracy on
# the same held-out split.
# Note: this rebinds `clf` and `ypred`, so after the loop they refer to the
# last model in `models` and its predictions, not to the earlier baseline.
print('Accuracy....')
for label, estimator in models:
    clf = estimator
    clf.fit(xtrain, ytrain)
    ypred = clf.predict(xtest)
    score = accuracy_score(ytest, ypred)
    print(label, ': ', score)

Accuracy....
Gaussian Classifier :  0.73
SVC :  0.72
Kernel SVM :  0.735
Random Forest Classifier :  0.745
