###### Credit : superdatascience.com

Business case: analyse restaurant review data and train classifiers that predict whether a review is positive or negative.

## Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the tab-separated review file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as
# ordinary text instead of being treated as field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

## Preprocessing

In [3]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the stop-word lists are available locally (no-op when already downloaded).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dahee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [5]:
# Preview the first ten rows: raw review text alongside its Liked label.
dataset.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [6]:
# Text cleaning: turn every raw review into one space-separated string of
# lower-cased, stop-word-filtered, Porter-stemmed tokens.

# Loop-invariant helpers are created once instead of once per review/word.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')              # keep 'not' so negations such as "not good" survive
stopword_set = set(all_stopwords)        # built once for fast membership tests

corpus = []
# Iterate over every row rather than assuming the file has exactly 1000 reviews.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)   # anything that is not a letter becomes a space
    review = review.lower().split()                 # lower-case, then split into words
    # Drop stop words, then reduce each remaining word to its stem.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    # Re-join the tokens so each corpus entry is a single cleaned string.
    corpus.append(' '.join(review))

In [7]:
# Show the first ten cleaned reviews.
corpus[:10]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

## 1/ Random Forest

##### Vectorisation

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: max_features caps the vocabulary at the 1500 most
# frequent terms in the corpus, so the rarest terms are the ones discarded.
cv = CountVectorizer(max_features=1500)
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()              # dense feature matrix
y = dataset.iloc[:, -1].values        # last column of the frame is the target

In [9]:
# Number of feature columns produced by the vectoriser.
X.shape[1]

1500

##### Set Split

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

##### Model

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees. Tree construction is randomised, so the seed is
# fixed to make the fitted model, and the metrics reported below, reproducible
# across kernel restarts.
classifier = RandomForestClassifier(n_estimators=500, random_state=0)
classifier.fit(X_train, y_train)

##### Prediction

In [12]:
# Predict a label for every row of the held-out test set with the classifier fitted above.
y_pred = classifier.predict(X_test)

##### Evaluation

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

# Confusion matrix of true labels (y_test) against predictions (y_pred).
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Overall accuracy, shown as the cell's output.
accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.72      0.92      0.81        97
           1       0.89      0.66      0.76       103

    accuracy                           0.79       200
   macro avg       0.81      0.79      0.78       200
weighted avg       0.81      0.79      0.78       200

[[89  8]
 [35 68]]


0.785

## 2/ Naive Bayes

##### Model

In [15]:
from sklearn.naive_bayes import GaussianNB
# Second model: Gaussian Naive Bayes trained on the same training split.
# Note: this rebinds `classifier`, so the random forest fitted earlier is no longer reachable by that name.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

##### Prediction

In [16]:
# Predict test-set labels with the current `classifier`; this overwrites the earlier `y_pred`.
y_pred = classifier.predict(X_test)

##### Evaluation

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

# Confusion matrix of true labels (y_test) against predictions (y_pred).
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Overall accuracy, shown as the cell's output.
accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200

[[55 42]
 [12 91]]


0.73