### Import libraries

In [19]:
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from bs4 import BeautifulSoup

import joblib

### Import data and preprocessing

In [3]:
# Load the tab-separated review file into a DataFrame and preview the first rows.
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# (rows, columns) of the loaded DataFrame
df.shape

(1000, 2)

In [5]:
# number of missing values in each column
df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [6]:
# check whether any review is a zero-length string
# (whitespace-only reviews are not detected by this test)
(df['Review'].str.len() == 0).any()

False

In [7]:
# class balance: how many reviews carry each Liked label
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

### Text Pre-Processing

<ul>
    <li>Converting to lower case</li>
    <li>Tokenising</li>
    <li>Removing stop words</li>
    <li>Words stemming</li>
    <li>Removing punctuation</li>  
    <li>Stripping out html tags</li>
    
</ul>

In [8]:
# download the 'punkt' data used for text tokenising
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm against the installed version
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/programmer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# download the list of stopwords from the nltk library
# (must run before clean_reviews is defined, because its default stop_words reads this corpus)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/programmer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# function to perform all text processing
def clean_reviews(review, stemmer=PorterStemmer(), stop_words=set(stopwords.words('english')),
                  keep_words=frozenset()):
    """Normalise one raw review into a space-separated string of stemmed tokens.

    Steps, in order: strip HTML markup, lower-case, tokenise, drop stop words
    and every token that is not purely alphabetic (punctuation, numbers,
    fragments such as "n't"), then stem what is left.

    Parameters
    ----------
    review : str
        Raw review text; may contain HTML tags.
    stemmer : object with a ``stem(word)`` method
        Defaults to a ``PorterStemmer`` created once, when this function is
        defined (default arguments are evaluated at definition time).
    stop_words : collection of str
        Lower-case words to discard. The default set is likewise built once at
        definition time, so the NLTK 'stopwords' corpus must already be
        available when this cell runs.
    keep_words : collection of str, optional
        Lower-case words to retain even when they appear in ``stop_words``.
        Empty by default, which reproduces the original behaviour exactly.
        The default stop list removes negations -- in this notebook
        "Crust is not good." is cleaned to "crust good" -- so for sentiment
        features consider passing e.g. ``keep_words={'not', 'no', 'nor'}``.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty string
        if no token survives the filters.
    """
    # remove html tags
    plain_text = BeautifulSoup(review, 'html.parser').get_text()

    # convert to lower case and split into word tokens
    tokens = word_tokenize(plain_text.lower())

    # keep alphabetic tokens that are either explicitly whitelisted or not
    # stop words, and stem each survivor
    stemmed = [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and (token in keep_words or token not in stop_words)
    ]

    return ' '.join(stemmed)

In [11]:
# Replace every raw review with its cleaned, stemmed form and preview the result.
# NOTE(review): this overwrites the raw text in place, so re-running the cell
# cleans already-cleaned text; reload df first if the cell must be re-executed.
df['Review'] = df['Review'].apply(clean_reviews)
df.head()

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust good,0
2,tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1


### Vectorization

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features with the vocabulary capped at 1000 terms, then convert
# the result to a dense array.
# NOTE(review): the vectorizer is fit on every row here, before the train/test
# split further down, so test-set text contributes to the vocabulary and IDF
# weights; fitting on the training rows only would avoid that leakage.
tfv = TfidfVectorizer(max_features=1000)
X = tfv.fit_transform(df['Review']).toarray()

In [16]:
# peek at the first five rows of the dense feature array
X[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# target labels: the Liked column (1 = liked, 0 = not liked, per the value counts above)
y = df['Liked']

In [17]:
# peek at the first five labels
y[:5]

0    1
1    0
2    0
3    1
4    1
Name: Liked, dtype: int64

In [20]:
# persist the fitted TF-IDF vectorizer to disk (presumably so new text can be
# transformed with the same vocabulary later — confirm against the consumer)
joblib.dump(tfv, 'tfv-transform.pkl')

['tfv-transform.pkl']

### Train/test split

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): X comes from a vectorizer that was fit on all rows, so the
# held-out rows already influenced the features — test scores may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Train a model

In [21]:
# Multinomial Naive Bayes is used as the classifier here; it is a common
# baseline for text classification, not necessarily the best — compare
# alternatives before relying on it.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [22]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict labels for the held-out rows, then tabulate them against the truth
# (rows = true class, columns = predicted class).
y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[74, 22],
       [29, 75]])

In [23]:
# per-class precision, recall and F1 on the held-out test rows
# (print is intentional: classification_report returns a formatted string)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.77      0.74        96
           1       0.77      0.72      0.75       104

    accuracy                           0.74       200
   macro avg       0.75      0.75      0.74       200
weighted avg       0.75      0.74      0.75       200



### Save the model


In [24]:
# persist the trained classifier to disk; it expects features produced by the
# vectorizer saved in 'tfv-transform.pkl'
joblib.dump(model, 'review-model.pkl')

['review-model.pkl']