In [1]:
# Importing Required libraries

import nltk
import string
import pandas as pd
import numpy as np

In [2]:
# Reading tsv file as csv file

df = pd.read_csv("/content/drive/MyDrive/Restaurant_Reviews.tsv", delimiter = '\t',quoting = 3)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# Checking any Null value

df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [4]:
# Getting more information about dataset

print("Size of dataset", df.shape)
print("\n")
print("Number of liked Review \n", df['Liked'].value_counts())
print("\n")


Size of dataset (1000, 2)


Number of liked Review 
 1    500
0    500
Name: Liked, dtype: int64




In [None]:
# Downloading and importing Stopwords

nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
# All punctuation marks in stopwords

[punc for punc in string.punctuation]

In [7]:
# Removing stopword and lowering the words and creating Tokenized Review

def text_preproces(msg):
  nonpun = [char for char in msg if char not in string.punctuation]
  nonpun = ''.join(nonpun)
  return ' '.join([word for word in nonpun.split() if word.lower() not in stopwords.words('english')])

In [None]:
df.head()

In [145]:
df['tokenized_Review'] = df['Review'].apply(text_preproces)
df.head()

Unnamed: 0,Review,Liked,tokenized_Review
0,Wow... Loved this place.,1,Wow Loved place
1,Crust is not good.,0,Crust good
2,Not tasty and the texture was just nasty.,0,tasty texture nasty
3,Stopped by during the late May bank holiday of...,1,Stopped late May bank holiday Rick Steve recom...
4,The selection on the menu was great and so wer...,1,selection menu great prices


In [144]:
# Converting Tokenized_Review words into vector using TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 10)
X = vectorizer.fit_transform(df['tokenized_Review']).toarray()
print(X)
print("\n")
print("Shape of vector X:", X.shape)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.62941926]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


Shape of vector X: (1000, 99)


In [None]:
# Splitting Dataset into Traing and Testing dataset

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['tokenized_Review'], df['Liked'], random_state=107, test_size=0.2)
X_train.head()

In [12]:
train_vectorized = vectorizer.transform(X_train)
X_train_array = train_vectorized.toarray()
test_vectorized = vectorizer.transform(X_test)
X_test_array = test_vectorized.toarray()

# 1 - Training Model Using **GaussainNB**

In [None]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train_array, y_train)

In [None]:
# Testing the model on test dataset

y_train_pred_nb = nb.predict(X_train_array)
y_test_pred_nb = nb.predict(X_test_array)

In [None]:
# Comparing predicted and actual value

pd.DataFrame({'Acutual_y_Value':y_test, "Predicted_y_Value":y_test_pred_nb})

Unnamed: 0,Acutual_y_Value,Predicted_y_Value
906,0,0
24,1,0
706,1,1
958,0,0
355,1,1
...,...,...
657,1,1
754,0,0
839,1,0
495,0,0


In [71]:
# Evaluting differnt metrics:

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix ,roc_curve, classification_report

In [72]:
def Metrics_Report(real, predicted):
  print("Accuracy score is {}".format(accuracy_score(real, predicted)))
  print("Recall score is {}".format(recall_score(real, predicted)))
  print("Precision score is {}".format(precision_score(real, predicted)))
  print("F1 score is {}".format(f1_score(real, predicted)))
  print("Roc Auc score is {}".format(roc_auc_score(real, predicted)))
  print("Confusion Matrix is {}".format(confusion_matrix(real, predicted)))
  print("\n")
  print("Classification Report is {}".format(classification_report(real, predicted)))

In [None]:
# Model Performance on training dataset

Metrics_Report(y_train, y_train_pred_nb)

Accuracy score is 0.76625
Recall score is 0.6532663316582915
Precision score is 0.8414239482200647
F1 score is 0.7355021216407356
Roc Auc score is 0.7656878921973049
Confusion Matrix is [[353  49]
 [138 260]]


Classification Report is               precision    recall  f1-score   support

           0       0.72      0.88      0.79       402
           1       0.84      0.65      0.74       398

    accuracy                           0.77       800
   macro avg       0.78      0.77      0.76       800
weighted avg       0.78      0.77      0.76       800



In [None]:
# Model Performance on Testing Dataset

Metrics_Report(y_test, y_test_pred_nb)

Accuracy score is 0.73
Recall score is 0.6078431372549019
Precision score is 0.8157894736842105
F1 score is 0.6966292134831461
Roc Auc score is 0.7324929971988796
Confusion Matrix is [[84 14]
 [40 62]]


Classification Report is               precision    recall  f1-score   support

           0       0.68      0.86      0.76        98
           1       0.82      0.61      0.70       102

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.73       200
weighted avg       0.75      0.73      0.73       200



# 2 - Training Model Using **MultinomialNB**

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
mn = MultinomialNB()
mn.fit(X_train_array, y_train)

In [None]:
y_train_pred_mn = mn.predict(X_train_array)
y_test_pred_mn = mn.predict(X_test_array)

In [None]:
# Model Performance on training dataset

Metrics_Report(y_train, y_train_pred_mn)

Accuracy score is 0.77625
Recall score is 0.6859296482412061
Precision score is 0.8348623853211009
F1 score is 0.7531034482758621
Roc Auc score is 0.7758006450161254
Confusion Matrix is [[348  54]
 [125 273]]


Classification Report is               precision    recall  f1-score   support

           0       0.74      0.87      0.80       402
           1       0.83      0.69      0.75       398

    accuracy                           0.78       800
   macro avg       0.79      0.78      0.77       800
weighted avg       0.79      0.78      0.77       800



In [None]:
# Model Performance on Testing Dataset

Metrics_Report(y_test, y_test_pred_mn)

Accuracy score is 0.76
Recall score is 0.6764705882352942
Precision score is 0.8214285714285714
F1 score is 0.7419354838709677
Roc Auc score is 0.7617046818727491
Confusion Matrix is [[83 15]
 [33 69]]


Classification Report is               precision    recall  f1-score   support

           0       0.72      0.85      0.78        98
           1       0.82      0.68      0.74       102

    accuracy                           0.76       200
   macro avg       0.77      0.76      0.76       200
weighted avg       0.77      0.76      0.76       200



# 3 - Training Model Using **LSTM RNN**

In [13]:
# Importing tensorflow and related libraries

import tensorflow as tf


In [14]:
tf.__version__

'2.13.0'

In [49]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [16]:
# Extracting Tokenized_Review from dataframe

corpus = df['tokenized_Review'].tolist()
corpus[1]

'Crust good'

In [17]:
# Extracting Dependent variable

Y = df['Liked']

In [147]:
### Vocabulary size
voc_size = 300

In [148]:
# One hot Representation of corpus

onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[173, 299, 13],
 [249, 142],
 [95, 135, 253],
 [130, 12, 169, 188, 291, 260, 191, 228, 299],
 [258, 59, 83, 54],
 [288, 86, 81, 281, 258],
 [155, 235, 264, 178],
 [77, 21, 179, 106, 39, 109, 117, 93, 166, 94],
 [75, 83],
 [83, 94],
 [251, 42],
 [229, 217, 261],
 [65, 146, 211, 138, 234, 48, 182, 141],
 [153, 158, 55, 159, 246, 72],
 [135, 213, 13, 44, 95],
 [58, 112, 79, 89],
 [225, 292],
 [87, 184, 258, 251],
 [13, 85, 93, 241, 178, 96],
 [21],
 [123, 260],
 [276, 212],
 [251, 113, 89],
 [106, 146, 208, 13, 5],
 [204],
 [19, 104, 166, 260, 121, 198, 142],
 [13, 166, 124, 22],
 [11, 210, 83, 147, 36, 42, 67, 157],
 [76, 288, 239, 276, 30, 144, 190, 276, 270, 211, 216, 120, 279, 21, 159, 295],
 [93, 272, 141],
 [113, 2, 21, 267, 75, 216, 194, 132, 217],
 [21, 58, 108],
 [99, 13, 121, 106, 188],
 [72, 21, 142, 191, 13, 206, 118, 69, 252, 276, 297, 79, 86],
 [148, 21, 13, 107],
 [188, 138, 190, 101],
 [58, 71, 142, 54],
 [120, 251, 68, 109, 66, 21, 26, 6, 93, 192, 157],
 [103, 95, 64, 77

In [149]:
# Embedding Representation

sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding = 'post', maxlen = sent_length)
print(embedded_docs)

[[173 299  13 ...   0   0   0]
 [249 142   0 ...   0   0   0]
 [ 95 135 253 ...   0   0   0]
 ...
 [148  99 229 ...   0   0   0]
 [296  70  28 ...   0   0   0]
 [152  52  82 ...   0   0   0]]


In [281]:
# Creating Model

embedding_vector_features = 40  # Word vector (feature representation) length
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length = sent_length))
model.add(Dropout(0.05))
model.add(LSTM(11))
model.add(Dropout(0.05))
model.add(Dense(1, activation ='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())

Model: "sequential_72"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_72 (Embedding)    (None, 20, 40)            12000     
                                                                 
 dropout_85 (Dropout)        (None, 20, 40)            0         
                                                                 
 lstm_69 (LSTM)              (None, 11)                2288      
                                                                 
 dropout_86 (Dropout)        (None, 11)                0         
                                                                 
 dense_74 (Dense)            (None, 1)                 12        
                                                                 
Total params: 14300 (55.86 KB)
Trainable params: 14300 (55.86 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [166]:
# Converting embedded docs and dependent variable into array

X_final = np.array(embedded_docs)
Y_final = np.array(Y)

In [162]:
X_final.shape, Y_final.shape

((1000, 20), (1000,))

In [167]:
# Splitting Training and Testing dataset

X_train2, X_test2, y_train2, y_test2 = train_test_split(X_final, Y_final, test_size = 0.20, random_state = 42)

In [282]:
# Training Model

model.fit(X_train2, y_train2, validation_data = (X_test2, y_test2), epochs = 10, batch_size = 50)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7db71452e890>

In [283]:
y_pred=model.predict(X_test2)



In [284]:
y_pred=np.where(y_pred > 0.5, 1,0)

In [285]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test2,y_pred)

0.71

In [286]:
from sklearn.metrics import classification_report
print(classification_report(y_test2,y_pred))

              precision    recall  f1-score   support

           0       0.70      0.70      0.70        96
           1       0.72      0.72      0.72       104

    accuracy                           0.71       200
   macro avg       0.71      0.71      0.71       200
weighted avg       0.71      0.71      0.71       200

