In [44]:
import numpy as np
import pandas as pd
file_path = '/content/drive/MyDrive/Forsk coding school code practices/Restaurant_Reviews.tsv'
df = pd.read_csv(file_path, delimiter = '\t', quoting = 3) #tsv file - \t or tab to read it

In [76]:
df.columns.to_list()
df.shape
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [46]:
features = df['Review']
labels = df['Liked']

In [47]:
#Applying train_test_split on dataset
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [48]:
#Creating "bag of words model" with Tfidfvectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(min_df=5) #min_df --> to add those words which are present in features atleast more than min-df

features_train_vectorized = tf.fit_transform(features_train) #fitiing the trained data

features_train_vectorized.shape

features_train_vectorized = features_train_vectorized.toarray() #converting to numpy array

features_train_vectorized.shape

(800, 291)

In [17]:
features_train_vectorized[0:2] #1st and 2nd row

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [49]:
#Training and classificatin using Deep learning model
from keras.models import Sequential #Sequential - model
from keras.layers.core import Dense, Dropout, Activation #dense - layering/layers
from keras.optimizers import adadelta_v2, adam_v2, rmsprop_v2
from keras.utils import np_utils

In [115]:
model = Sequential()
model.add(Dense(200, input_shape=(291,))) #nodes - 200, input_shape - input layer/total words in bag of model/vectorized features shape
model.add(Activation('relu')) #relu - algorithm
model.add(Dropout(0.5))

#2nd layer
model.add(Dense(100)) #nodes - 100
model.add(Activation('relu')) #relu activation
model.add(Dropout(0.5))

#3rd layer
model.add(Dense(40)) #nodes - 40
model.add(Activation('relu')) #relu activation
model.add(Dropout(0.5))

#4th layer
model.add(Dense(10)) #nodes - 10
model.add(Activation('relu')) #relu activation
model.add(Dropout(0.5))

#5th layer/output layer/last layer
model.add(Dense(1)) #nodel - 1, for output layer
model.add(Activation('sigmoid')) #sigmoid activation in output/binary classification

model.compile(loss='binary_crossentropy', optimizer = 'adam') #adam - optimizer and loss is binary_crossentropy

model.fit(features_train_vectorized, labels_train, batch_size = 100, epochs = 20) #fitting the model

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb9c6998790>

In [116]:
#predictions
"""
Please use instead:* "np.argmax(model.predict(x), axis=-1)",   if your model does multi-class classification   
(e.g. if it uses a 'softmax' last-layer activation).* "(model.predict(x) > 0.5).astype("int32")",   if your model does binary classification
(e.g. if it uses a 'sigmoid' last-layer activation)
"""
labels_train_prediction = (model.predict(features_train_vectorized) > 0.5).astype("int32")
labels_test_prediction = (model.predict((tf.transform(features_test).toarray()))> 0.5).astype("int32")

In [92]:
print(list(zip(labels_train_prediction[0:5], labels_train[0:5])))
print(list(zip(labels_test_prediction[0:5], labels_test[0:5])))

[(array([0], dtype=int32), 0), (array([1], dtype=int32), 1), (array([0], dtype=int32), 0), (array([0], dtype=int32), 0), (array([1], dtype=int32), 1)]
[(array([0], dtype=int32), 1), (array([1], dtype=int32), 1), (array([1], dtype=int32), 1), (array([1], dtype=int32), 1), (array([1], dtype=int32), 1)]


In [87]:
#print(len(labels_train))
#print(len(labels_train_prediction))
#print(len(labels_test))
#print(len(labels_test_prediction))

In [117]:
#Accuracy/score
from sklearn.metrics import accuracy_score
print('Training score is:', accuracy_score(labels_train, labels_train_prediction)*100,'%')
print('Testing score is:', accuracy_score(labels_test, labels_test_prediction)*100,'%')

Training score is: 94.25 %
Testing score is: 76.0 %


In [95]:
#Accuracy by confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(labels_train, labels_train_prediction)

score_train = (cm[0][0] + cm[1][1])/cm.sum()

cm2 = confusion_matrix(labels_test, labels_test_prediction)

score_test = (cm2[0][0] + cm2[1][1])/cm2.sum()

print('confusion matrix for the training set is:\n',cm)
print('Training Accuracy for the model is:',score_train*100,'%')

print('confusion matrix for the testing set is:\n',cm2)
print('Testing Accuracy for the model is:',score_test*100,'%')

confusion matrix for the training set is:
 [[401   3]
 [  1 395]]
Training Accuracy for the model is: 99.5 %
confusion matrix for the testing set is:
 [[83 13]
 [35 69]]
Testing Accuracy for the model is: 76.0 %


In [77]:
#Testing for a review data sample
data = ['This restaurant has less number of workers which makes customers wait for longer than expected.'] #give data as alist

data = tf.transform(data) #sparse matrix type - convert to numpy array - use toarray()/todense()

data = data.toarray()

prediction = (model.predict(data) > 0.5).astype("int32")

print(prediction[0][0]) #wrong prediction is coming, more datasets may be required.

1
