# Explanation

- [Videos on Udemy](https://www.udemy.com/machinelearning/learn/lecture/10459594)

## Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Importing the Dataset

In [2]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

In [3]:
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


## Cleaning the Texts

### Importing additional Libraries

In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dmitry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
ps = PorterStemmer()
corpus = []
for item in dataset['Review']:
    review = re.sub(r'[^a-zA-Z]', ' ', item)
    review = review.lower()
    review = [ps.stem(word) for word in review.split(' ') if word and word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

## Creating the Bag of Words Model

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

## Splitting the Dataset into Training set and Test set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

## Naive Bayes Classifier

In [9]:
from sklearn.naive_bayes import GaussianNB
naive_bayes = {}
naive_bayes['name'] = 'Naive Bayes'
naive_bayes['classifier'] = GaussianNB()
naive_bayes['classifier'].fit(X_train, y_train)

naive_bayes['y_pred'] = naive_bayes['classifier'].predict(X_test)

from sklearn.metrics import confusion_matrix
naive_bayes['confusion_matrix'] = confusion_matrix(y_test, naive_bayes['y_pred'])

naive_bayes['tp'] = naive_bayes['confusion_matrix'][0, 0]
naive_bayes['fp'] = naive_bayes['confusion_matrix'][0, 1]
naive_bayes['fn'] = naive_bayes['confusion_matrix'][1, 0]
naive_bayes['tn'] = naive_bayes['confusion_matrix'][1, 1]

naive_bayes['accuracy'] = (naive_bayes['tp'] + naive_bayes['tn']) / (naive_bayes['tp'] + naive_bayes['tn'] + naive_bayes['fp'] + naive_bayes['fn'])
naive_bayes['precision'] = naive_bayes['tp'] / (naive_bayes['tp'] + naive_bayes['fp'])
naive_bayes['recall'] = naive_bayes['tp'] / (naive_bayes['tp'] + naive_bayes['fn'])
naive_bayes['f1_score'] = 2 * naive_bayes['precision'] * naive_bayes['recall'] / (naive_bayes['precision'] + naive_bayes['recall'])

print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1 Score: {}'.format(naive_bayes['accuracy'],
                                                                     naive_bayes['precision'],
                                                                     naive_bayes['recall'],
                                                                     naive_bayes['f1_score']))

Accuracy: 0.73
Precision: 0.5670103092783505
Recall: 0.8208955223880597
F1 Score: 0.6707317073170731


## Decision Tree Classifier

In [10]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = {}
decision_tree['name'] = 'Decision Tree'
decision_tree['classifier'] = DecisionTreeClassifier(criterion='entropy', random_state=0)
decision_tree['classifier'].fit(X_train, y_train)

decision_tree['y_pred'] = decision_tree['classifier'].predict(X_test)

from sklearn.metrics import confusion_matrix
decision_tree['confusion_matrix'] = confusion_matrix(y_test, decision_tree['y_pred'])

decision_tree['tp'] = decision_tree['confusion_matrix'][0, 0]
decision_tree['fp'] = decision_tree['confusion_matrix'][0, 1]
decision_tree['fn'] = decision_tree['confusion_matrix'][1, 0]
decision_tree['tn'] = decision_tree['confusion_matrix'][1, 1]

decision_tree['accuracy'] = (decision_tree['tp'] + decision_tree['tn']) / (decision_tree['tp'] + decision_tree['tn'] + decision_tree['fp'] + decision_tree['fn'])
decision_tree['precision'] = decision_tree['tp'] / (decision_tree['tp'] + decision_tree['fp'])
decision_tree['recall'] = decision_tree['tp'] / (decision_tree['tp'] + decision_tree['fn'])
decision_tree['f1_score'] = 2 * decision_tree['precision'] * decision_tree['recall'] / (decision_tree['precision'] + decision_tree['recall'])

print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1 Score: {}'.format(decision_tree['accuracy'],
                                                                     decision_tree['precision'],
                                                                     decision_tree['recall'],
                                                                     decision_tree['f1_score']))

Accuracy: 0.71
Precision: 0.7628865979381443
Recall: 0.6788990825688074
F1 Score: 0.7184466019417477


## Random Forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
random_forest = {}
random_forest['name'] = 'Random Forest'
random_forest['classifier'] = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
random_forest['classifier'].fit(X_train, y_train)

random_forest['y_pred'] = random_forest['classifier'].predict(X_test)

from sklearn.metrics import confusion_matrix
random_forest['confusion_matrix'] = confusion_matrix(y_test, random_forest['y_pred'])

random_forest['tp'] = random_forest['confusion_matrix'][0, 0]
random_forest['fp'] = random_forest['confusion_matrix'][0, 1]
random_forest['fn'] = random_forest['confusion_matrix'][1, 0]
random_forest['tn'] = random_forest['confusion_matrix'][1, 1]

random_forest['accuracy'] = (random_forest['tp'] + random_forest['tn']) / (random_forest['tp'] + random_forest['tn'] + random_forest['fp'] + random_forest['fn'])
random_forest['precision'] = random_forest['tp'] / (random_forest['tp'] + random_forest['fp'])
random_forest['recall'] = random_forest['tp'] / (random_forest['tp'] + random_forest['fn'])
random_forest['f1_score'] = 2 * random_forest['precision'] * random_forest['recall'] / (random_forest['precision'] + random_forest['recall'])

print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1 Score: {}'.format(random_forest['accuracy'],
                                                                     random_forest['precision'],
                                                                     random_forest['recall'],
                                                                     random_forest['f1_score']))

Accuracy: 0.72
Precision: 0.8969072164948454
Recall: 0.6541353383458647
F1 Score: 0.7565217391304349


## SVM Classifier

In [12]:
from sklearn.svm import SVC
svm = {}
svm['name'] = 'SVM'
svm['classifier'] = SVC(kernel='linear', random_state=0)
svm['classifier'].fit(X_train, y_train)

svm['y_pred'] = svm['classifier'].predict(X_test)

from sklearn.metrics import confusion_matrix
svm['confusion_matrix'] = confusion_matrix(y_test, svm['y_pred'])

svm['tp'] = svm['confusion_matrix'][0, 0]
svm['fp'] = svm['confusion_matrix'][0, 1]
svm['fn'] = svm['confusion_matrix'][1, 0]
svm['tn'] = svm['confusion_matrix'][1, 1]

svm['accuracy'] = (svm['tp'] + svm['tn']) / (svm['tp'] + svm['tn'] + svm['fp'] + svm['fn'])
svm['precision'] = svm['tp'] / (svm['tp'] + svm['fp'])
svm['recall'] = svm['tp'] / (svm['tp'] + svm['fn'])
svm['f1_score'] = 2 * svm['precision'] * svm['recall'] / (svm['precision'] + svm['recall'])

print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1 Score: {}'.format(svm['accuracy'],
                                                                     svm['precision'],
                                                                     svm['recall'],
                                                                     svm['f1_score']))

Accuracy: 0.72
Precision: 0.7628865979381443
Recall: 0.6915887850467289
F1 Score: 0.7254901960784315


## Kernel SVM Classifier

In [13]:
from sklearn.svm import SVC
kernel_svm = {}
kernel_svm['name'] = 'Kernel SVM'
kernel_svm['classifier'] = SVC(kernel='rbf', random_state=0)
kernel_svm['classifier'].fit(X_train, y_train)

kernel_svm['y_pred'] = kernel_svm['classifier'].predict(X_test)

from sklearn.metrics import confusion_matrix
kernel_svm['confusion_matrix'] = confusion_matrix(y_test, kernel_svm['y_pred'])

kernel_svm['tp'] = kernel_svm['confusion_matrix'][0, 0]
kernel_svm['fp'] = kernel_svm['confusion_matrix'][0, 1]
kernel_svm['fn'] = kernel_svm['confusion_matrix'][1, 0]
kernel_svm['tn'] = kernel_svm['confusion_matrix'][1, 1]

kernel_svm['accuracy'] = (kernel_svm['tp'] + kernel_svm['tn']) / (kernel_svm['tp'] + kernel_svm['tn'] + kernel_svm['fp'] + kernel_svm['fn'])
kernel_svm['precision'] = kernel_svm['tp'] / (kernel_svm['tp'] + kernel_svm['fp'])
kernel_svm['recall'] = kernel_svm['tp'] / (kernel_svm['tp'] + kernel_svm['fn'])
kernel_svm['f1_score'] = 2 * kernel_svm['precision'] * kernel_svm['recall'] / (kernel_svm['precision'] + kernel_svm['recall'])

print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1 Score: {}'.format(kernel_svm['accuracy'],
                                                                     kernel_svm['precision'],
                                                                     kernel_svm['recall'],
                                                                     kernel_svm['f1_score']))



Accuracy: 0.485
Precision: 1.0
Recall: 0.485
F1 Score: 0.6531986531986532


## Logistic Regression Classifier

In [14]:
from sklearn.linear_model import LogisticRegression
logistic_regression = {}
logistic_regression['name'] = 'Logistic Regression'
logistic_regression['classifier'] = LogisticRegression(random_state=0)
logistic_regression['classifier'].fit(X_train, y_train)

logistic_regression['y_pred'] = logistic_regression['classifier'].predict(X_test)

from sklearn.metrics import confusion_matrix
logistic_regression['confusion_matrix'] = confusion_matrix(y_test, logistic_regression['y_pred'])

logistic_regression['tp'] = logistic_regression['confusion_matrix'][0, 0]
logistic_regression['fp'] = logistic_regression['confusion_matrix'][0, 1]
logistic_regression['fn'] = logistic_regression['confusion_matrix'][1, 0]
logistic_regression['tn'] = logistic_regression['confusion_matrix'][1, 1]

logistic_regression['accuracy'] = (logistic_regression['tp'] + logistic_regression['tn']) / (logistic_regression['tp'] + logistic_regression['tn'] + logistic_regression['fp'] + logistic_regression['fn'])
logistic_regression['precision'] = logistic_regression['tp'] / (logistic_regression['tp'] + logistic_regression['fp'])
logistic_regression['recall'] = logistic_regression['tp'] / (logistic_regression['tp'] + logistic_regression['fn'])
logistic_regression['f1_score'] = 2 * logistic_regression['precision'] * logistic_regression['recall'] / (logistic_regression['precision'] + logistic_regression['recall'])

print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1 Score: {}'.format(logistic_regression['accuracy'],
                                                                     logistic_regression['precision'],
                                                                     logistic_regression['recall'],
                                                                     logistic_regression['f1_score']))

Accuracy: 0.71
Precision: 0.7835051546391752
Recall: 0.672566371681416
F1 Score: 0.7238095238095238




## KNN Classifier

In [15]:
from sklearn.neighbors import KNeighborsClassifier
knn = {}
knn['name'] = 'KNN'
knn['classifier'] = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn['classifier'].fit(X_train, y_train)

knn['y_pred'] = knn['classifier'].predict(X_test)

from sklearn.metrics import confusion_matrix
knn['confusion_matrix'] = confusion_matrix(y_test, knn['y_pred'])

knn['tp'] = knn['confusion_matrix'][0, 0]
knn['fp'] = knn['confusion_matrix'][0, 1]
knn['fn'] = knn['confusion_matrix'][1, 0]
knn['tn'] = knn['confusion_matrix'][1, 1]

knn['accuracy'] = (knn['tp'] + knn['tn']) / (knn['tp'] + knn['tn'] + knn['fp'] + knn['fn'])
knn['precision'] = knn['tp'] / (knn['tp'] + knn['fp'])
knn['recall'] = knn['tp'] / (knn['tp'] + knn['fn'])
knn['f1_score'] = 2 * knn['precision'] * knn['recall'] / (knn['precision'] + knn['recall'])

print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1 Score: {}'.format(knn['accuracy'],
                                                                     knn['precision'],
                                                                     knn['recall'],
                                                                     knn['f1_score']))

Accuracy: 0.61
Precision: 0.7628865979381443
Recall: 0.5736434108527132
F1 Score: 0.654867256637168


## Comparison

In [16]:
classifiers = [naive_bayes, decision_tree, random_forest, svm, kernel_svm, logistic_regression, knn]

pd.DataFrame([[classifier['accuracy'], classifier['precision'], classifier['recall'], classifier['f1_score']] for classifier in classifiers],
             [classifier['name'] for classifier in classifiers],
             ['Accuracy', 'Precision', 'Recall', 'F1 Score'])

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Naive Bayes,0.73,0.56701,0.820896,0.670732
Decision Tree,0.71,0.762887,0.678899,0.718447
Random Forest,0.72,0.896907,0.654135,0.756522
SVM,0.72,0.762887,0.691589,0.72549
Kernel SVM,0.485,1.0,0.485,0.653199
Logistic Regression,0.71,0.783505,0.672566,0.72381
KNN,0.61,0.762887,0.573643,0.654867
