# Natural Language Processing

In [1]:
#Import Libraries
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

In [2]:
# Read the tab-separated reviews file into a dataframe.
# quoting = 3 corresponds to csv.QUOTE_NONE: quote characters inside the reviews
# are kept as ordinary text instead of being interpreted as field quoting.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# Fetch the NLTK stopwords package into the local nltk_data directory
# (the log below shows it is left alone when it is already up to date)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amitc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Stopwords are very common words (e.g. "the", "is") that carry little meaning on their own and are removed from each review
from nltk.corpus import stopwords

# PorterStemmer performs stemming for every word
from nltk.stem.porter import PorterStemmer

In [5]:
# Perform data cleaning and preprocessing.
# Each review is turned into a space-separated string of stemmed, lower-case
# words with English stopwords removed.
# NOTE(review): the stopword filter also drops negations -- "Crust is not good."
# ends up as "crust good" -- which may hurt sentiment classification.

# Build the stopword set and the stemmer once: both are loop-invariant, and a
# set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 1000) so the
# corpus always has exactly one entry per row of the dataframe.
for raw_review in dataset['Review']:
    # [\w] matches (alphanumeric or underscore) and [\W] matches (not (alphanumeric or underscore));
    # every run of such characters (plus underscores) collapses into a single space
    review = re.sub(r'[\W_]+', ' ', raw_review)
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
# Show the first five cleaned reviews, one per line
print(*(corpus[idx] for idx in range(5)), sep='\n')

wow love place
crust good
tasti textur nasti
stop late may bank holiday rick steve recommend love
select menu great price


In [7]:
# Build the bag-of-words model: one row per review, one column per vocabulary term
from sklearn.feature_extraction.text import CountVectorizer

# max_features caps the vocabulary at the 1500 most frequent terms,
# which reduces the sparsity (and width) of the resulting matrix
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

# Labels are taken from the second column of the dataframe
# (the "Liked" column according to the head() output above)
y = dataset.iloc[:, 1].values

print(X)
# Number of rows (reviews) followed by number of columns (features)
print(*X.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
1000 1500


In [10]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [11]:
# Train a Gaussian Naive Bayes classifier on the training split
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
# Left as the final expression so the fitted estimator is displayed as the cell output
classifier.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [12]:
# Use the fitted classifier to label every review in the held-out test set
y_pred = classifier.predict(X=X_test)

In [14]:
# Build the confusion matrix: rows are the true labels, columns the predicted labels
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[55 42]
 [13 90]]


In [15]:
# Accuracy = correct predictions / all predictions, as a percentage.
# It is derived from the confusion matrix instead of being typed in by hand, so
# it stays correct if the data, the split or the model changes.
# The diagonal of cm holds the correct predictions (55 + 90 = 145 in this run);
# the off-diagonal entries are the incorrect ones (42 + 13 = 55 in this run).
acc_percent = (cm.trace() / cm.sum()) * 100
print(acc_percent)

72.5


In [None]:
# Given that we only had 800 observations to train our model, a test accuracy of 72.5% is decent!