In [1]:
import pandas

In [2]:
df = pd.read_csv("Restaurant_Reviews.tsv", sep='\t')

#loading the dataset
# \t will separate the dataset into two columns

<IPython.core.display.Javascript object>

In [3]:
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [4]:
df.columns

## The names of the columns in the dataset

Index(['Review', 'Liked'], dtype='object')

In [5]:
df.shape

## number of rows and columns in the dataset

(1000, 2)

In [6]:
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [7]:
# Therefore, the dataset has same number of good and bad reviews

## DATA PREPROCESSING

In [8]:
import nltk
from nltk.corpus import stopwords
import re

from nltk.stem import WordNetLemmatizer

In [9]:
# We imported the necessary libraries to perform Natural Language Processing on the dataset

In [10]:
# Now we will clean the reviews 

In [11]:
lemmatizer = WordNetLemmatizer()

In [12]:
corpus=[]

In [13]:
for i in range(0,len(df)):
    review = re.sub('[^a-zA-Z]',' ',df['Review'][i])    #review will only have elements in (a-z) or (A-Z)
    review = review.lower()                              #all the text will be lowered
    review = review.split()                              #the sentences will be splitted into words
    review = [lemmatizer.lemmatize(word)  for word in review if word not in set(stopwords.words('english'))]       #applying stop words
    review = ' '.join(review)                            #joining the words after getting filtered through stopwords
    corpus.append(review)                                #adding the words into the corpus list

In [14]:
corpus[0:4]

['wow loved place',
 'crust good',
 'tasty texture nasty',
 'stopped late may bank holiday rick steve recommendation loved']

In [19]:
# Now we will import tf idf to make the bag of words model

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tf = TfidfVectorizer()

In [22]:
x = tf.fit_transform(corpus).toarray()

In [23]:
y = df['Liked']

In [24]:
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
y

0      1
1      0
2      0
3      1
4      1
      ..
995    0
996    0
997    0
998    0
999    0
Name: Liked, Length: 1000, dtype: int64

In [26]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2)

# We imported the train test split to split our dataset into train and test split

In [27]:
# No we will fit the Multinomial Naive Bayes to the training set
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(xtrain, ytrain)

MultinomialNB()

In [28]:
ypred = mnb.predict(xtest)

In [29]:
ypred

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0], dtype=int64)

In [30]:
# we will import the confusion matrix

In [31]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(ytest,ypred)

In [32]:
cm

array([[74, 18],
       [22, 86]], dtype=int64)

In [33]:
# we see that 74+86 reviews are correctly classified

In [34]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(ytest, ypred)

#we import accuracy score to check the accuracy of the model

In [35]:
acc

0.8

# NOW WE WILL MAKE PREDICTIONS

In [36]:
def pred(review):
  review = re.sub('[^a-zA-Z]',' ', string = review)
  review = review.lower()
  review_words = review.split()
  review_words = [word for word in review_words if not word in set(stopwords.words('english'))]
  final_review = [lemmatizer.lemmatize(word) for word in review_words]
  final_review = ' '.join(final_review)

  temp = tf.transform([final_review]).toarray()
  return mnb.predict(temp)


#we write a predict sentiment function to further classify a review as positive or negative

In [37]:
# PREDICTION 1

review = 'The food was really good'
if pred(review):
  print('This is a Positive Review.')
else:
  print('This is a Negative Review.')

This is a Positive Review.


In [42]:
# PREDICTION 2

review = 'The service was really bad'
if pred(review):
  print('This is a Positive Review.')
else:
  print('This is a Negative Review.')

This is a Negative Review.


In [43]:
# PREDICTION 3

review = 'Absolutely Delicious'
if pred(review):
  print('This is a Positive Review.')
else:
  print('This is a Negative Review.')

This is a Positive Review.


In [46]:
# PREDICTION 4

review = 'Food quality was below standards'
if pred(review):
  print('This is a Positive Review.')
else:
  print('This is a Negative Review.')

This is a Negative Review.


In [47]:
# PREDICTION 5

review = 'The food was tasty and the waiters were really nice'
if pred(review):
  print('This is a Positive Review.')
else:
  print('This is a Negative Review.')

This is a Positive Review.


In [48]:
# PREDICTION 6

review = 'the food was horrible'
if pred(review):
  print('This is a Positive Review.')
else:
  print('This is a Negative Review.')

This is a Negative Review.


### WE CAN SEE THAT ALL THE REVIEWS ARE BEING CLASSIFIED CORRECTLY WITH AN ACCURACY OF 80%

In [49]:
# NOW WE WILL IMPORT PICKLE TO SAVE THE MODEL

In [50]:
import pickle

In [51]:
pickle.dump(tf, open('ressenttf.pkl','wb'))

In [52]:
pickle.dump(mnb, open('resmnbclassifier.pkl','wb'))