In [2]:
import pandas as pd
import numpy as np

In [106]:
df = pd.read_csv('/Users/craiglynch/Desktop/Data_Science_Review/Restaurant Reviews NLP/Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Preprocessing Data

In [107]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/craiglynch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [108]:
# Remove Punctuation
import string
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text
df['Review'] = df['Review'].apply(remove_punctuations)

# Lowercase
df['Review'] = df['Review'].str.lower()

# Split to Array
#review = review.str.split()

# Stem words
ps = PorterStemmer()
df['Review'] = [ps.stem(word) for word in df['Review'] if not word in stop]

# Drop stop words
df['Review_without_stopwords'] = df['Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


# # Rejoin array elements back into a string
# review = ' '.join(review)

# # Append string to create array of cleaned text
# vocab.append(review)

In [109]:
review = df['Review_without_stopwords'].tolist()

In [110]:
# Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer

# Extract max 1000 features (can adjust this during further iterations to see how it affects)
cv = CountVectorizer(max_features = 1000)
# Vocab List
X = cv.fit_transform(review).toarray()
# Is the review positive or negative
y = df.iloc[:, 1].values


In [111]:
print(X.shape)
print(y.shape)

(1000, 1000)
(1000,)


# Splitting Data into Train/Test

In [113]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 15)

In [117]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.1)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall= recall_score(y_test,y_pred)
print("\n")
print("Accuracy is ",round(accuracy*100,2),"%")
print("Precision is ",round(precision,2))
print("Recall is ",round(recall,2))

[[ 92  25]
 [ 31 102]]


Accuracy is  77.6 %
Precision is  0.8
Recall is  0.77


In [124]:
# Logistic Regression

from sklearn import linear_model
linear = linear_model.LogisticRegression(C=3)
linear.fit(X_train, y_train)

y_pred = linear.predict(X_test)

# Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy, Precision and Recall
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall= recall_score(y_test,y_pred)
print("\n")
print("Accuracy is ",round(accuracy*100,2),"%")
print("Precision is ",round(precision,2))
print("Recall is ",round(recall,2))

[[100  17]
 [ 37  96]]


Accuracy is  78.4 %
Precision is  0.85
Recall is  0.72


In [126]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
 
model = RandomForestClassifier(n_estimators = 501, criterion = 'entropy')                           
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy, Precision and Recall
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall= recall_score(y_test,y_pred)
print("\n")
print("Accuracy is ",round(accuracy*100,2),"%")
print("Precision is ",round(precision,2))
print("Recall is ",round(recall,2))

[[96 21]
 [49 84]]


Accuracy is  72.0 %
Precision is  0.8
Recall is  0.63


## Logistic regression model is correct 78.4% of the time, and is our best model

Dataset from:
https://www.kaggle.com/hj5992/restaurantreviews

Reference:
https://www.kaggle.com/apekshakom/sentiment-analysis-of-restaurant-reviews