# Text Classification Using Bernoulli Naive Bayes algorithm

In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
# sep='\t' must be given because the file is tab-separated (read_csv assumes commas by default).

In [4]:
# Preview the first rows: the review text and its 0/1 'Liked' label.
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1000, 2)

In [7]:
# Count how many reviews carry each label (0 and 1) to check the class balance.
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

# Data Cleaning

In [13]:
import nltk
##install nltk : pip install nltk

In [14]:
# Fetch the NLTK 'stopwords' corpus so the English stop-word list can be loaded later.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Ankit
[nltk_data]     Mohari\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [15]:
# `stopwords` was used here without ever being imported, so this cell failed
# with NameError on a fresh kernel.
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))
print(stopWords)
# Stop words are very common connecting words (e.g. 'the', 'is', 'to') that carry
# little meaning on their own, so they can be dropped from the reviews.

{'been', 'the', 'his', 'didn', "isn't", 'hers', 'other', 'does', 'she', 've', 'then', 'y', 'be', 'these', 'from', 'there', 'after', 'each', "haven't", 'out', 'm', 'over', 'don', 'o', "she's", 'further', 's', 'very', 'you', 'theirs', 'do', 'd', 'below', 'some', 'mightn', 'hasn', 'doing', 'should', 'this', 'needn', "you've", 'such', 'was', 'to', 'up', 'are', 'with', 'a', 'whom', "shouldn't", 'most', 'but', 'any', "that'll", 'above', "mustn't", "won't", 'once', 'did', 'being', 'its', 'weren', 'aren', 'your', 'i', 'ours', 'where', 're', "couldn't", 'doesn', 'or', 'won', 'we', 'were', "mightn't", 'me', 'them', 'myself', 'has', 'off', "you're", 'shan', 'on', 'too', 'nor', 'our', 'he', 'here', 'why', 'having', 'is', "should've", 'by', 'at', 'mustn', 'because', 'ma', 'will', 'few', 'had', 'if', 'than', "wouldn't", "don't", 'shouldn', 'now', 'of', 'as', 'can', "didn't", 'under', 'am', 'while', 'same', 'in', "weren't", 'down', "needn't", 'it', 'into', 'before', "doesn't", 'that', 'during', 'thei

In [16]:
import string
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    # Imported locally so this helper does not depend on a `stopwords` name
    # having been imported by some earlier cell.
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('english'))


def text_cleaning(a, stop_words=None):
    """Strip punctuation from a review and drop its stop words.

    Parameters
    ----------
    a : str
        Raw review text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (loaded once and cached).

    Returns
    -------
    list of str
        The remaining tokens in their original order and casing. Only the
        stop-word comparison is case-insensitive, so e.g. 'Loved' and 'loved'
        remain two distinct tokens.
    """
    if stop_words is None:
        # The original rebuilt the full stop-word list for every single word;
        # a cached set gives the same result with one load and O(1) lookups.
        stop_words = _english_stop_words()
    # Remove punctuation character by character, then split on whitespace.
    no_punctuation = ''.join(char for char in a if char not in string.punctuation)
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
bow_transformer = CountVectorizer(analyzer=text_cleaning).fit(df['Review'])
# CountVectorizer learns a vocabulary from the reviews, using text_cleaning to turn
# each review into tokens, and assigns every distinct token an integer column index.
# Label encoding is not used here because it suits columns holding a single
# categorical value per row, not free text made of many words.

In [19]:
# Show how the fitted vectorizer is configured (it uses text_cleaning as its analyzer).
print(bow_transformer)

CountVectorizer(analyzer=<function text_cleaning at 0x00000279590E3E50>)


In [21]:
print(len(bow_transformer.vocabulary_))
bow_transformer.vocabulary_

# The vocabulary holds 2159 distinct tokens; each one is mapped to a column index, e.g. 'Wow' -> 442.

2159


{'Wow': 442,
 'Loved': 248,
 'place': 1540,
 'Crust': 112,
 'good': 1079,
 'tasty': 1932,
 'texture': 1945,
 'nasty': 1408,
 'Stopped': 381,
 'late': 1255,
 'May': 265,
 'bank': 541,
 'holiday': 1162,
 'Rick': 342,
 'Steve': 380,
 'recommendation': 1651,
 'loved': 1305,
 'selection': 1755,
 'menu': 1359,
 'great': 1089,
 'prices': 1584,
 'getting': 1061,
 'angry': 480,
 'want': 2074,
 'damn': 792,
 'pho': 1528,
 'Honeslty': 204,
 'didnt': 832,
 'taste': 1928,
 'fresh': 1035,
 'potatoes': 1571,
 'like': 1279,
 'rubber': 1707,
 'could': 752,
 'tell': 1937,
 'made': 1318,
 'ahead': 463,
 'time': 1966,
 'kept': 1238,
 'warmer': 2078,
 'fries': 1040,
 'touch': 1987,
 'Service': 361,
 'prompt': 1596,
 'Would': 441,
 'go': 1072,
 'back': 532,
 'cashier': 663,
 'care': 656,
 'ever': 939,
 'say': 1736,
 'still': 1870,
 'ended': 921,
 'wayyy': 2092,
 'overpriced': 1474,
 'tried': 1999,
 'Cape': 89,
 'Cod': 100,
 'ravoli': 1634,
 'chicken': 688,
 'cranberrymmmm': 768,
 'disgusted': 856,
 'pretty'

In [20]:
title_bow = bow_transformer.transform(df['Review'])
print(title_bow)

# Each printed line reads "(row, column)  count": row is the review's position in df,
# column is the vocabulary index of a token, and count is how often that token occurs
# in the review. E.g. "(0, 248) 1" means the first review contains 'Loved' (index 248) once.

  (0, 248)	1
  (0, 442)	1
  (0, 1540)	1
  (1, 112)	1
  (1, 1079)	1
  (2, 1408)	1
  (2, 1932)	1
  (2, 1945)	1
  (3, 265)	1
  (3, 342)	1
  (3, 380)	1
  (3, 381)	1
  (3, 541)	1
  (3, 1162)	1
  (3, 1255)	1
  (3, 1305)	1
  (3, 1651)	1
  (4, 1089)	1
  (4, 1359)	1
  (4, 1584)	1
  (4, 1755)	1
  (5, 480)	1
  (5, 792)	1
  (5, 1061)	1
  (5, 1528)	1
  :	:
  (997, 1072)	1
  (997, 1199)	1
  (997, 2135)	1
  (998, 288)	1
  (998, 387)	1
  (998, 953)	1
  (998, 1072)	1
  (998, 1420)	1
  (998, 1955)	1
  (998, 1966)	1
  (998, 2020)	1
  (998, 2099)	1
  (998, 2106)	1
  (999, 618)	1
  (999, 679)	1
  (999, 879)	1
  (999, 925)	1
  (999, 1113)	1
  (999, 1274)	1
  (999, 1572)	1
  (999, 1721)	1
  (999, 1966)	1
  (999, 1979)	1
  (999, 2083)	1
  (999, 2138)	1


In [22]:
X = title_bow.toarray()
print(X)
X.shape
# Convert the sparse bag-of-words matrix into a dense array:
# 1000 rows (one per review) and 2159 columns (one per vocabulary token).

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


(1000, 2159)

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; random_state=42 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, df.Liked,test_size=0.2, random_state=42)

In [25]:
#from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [26]:
# Fit a Bernoulli Naive Bayes classifier with default settings on the training split.
model = BernoulliNB().fit(x_train,y_train)

In [27]:
# Predict a 0/1 label for every review in the held-out test split.
pred = model.predict(x_test)
print(pred)

[0 1 1 1 1 1 0 1 0 1 0 1 0 0 0 1 0 0 1 1 1 1 1 1 0 0 0 0 1 0 0 1 0 1 0 1 1
 0 1 0 1 0 1 0 1 0 0 0 1 0 0 1 1 1 1 0 0 1 0 0 0 1 1 1 0 0 1 1 0 0 1 0 1 0
 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 1 0 0 1 0 0 1 1 1 1 0 0 0 0
 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 0 0 0 1 1 0 0 0 0
 1 0 0 1 0 0 0 1 1 1 0 0 1 1 0 0 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1
 1 1 0 0 1 0 1 1 0 0 1 1 1 0 1]


In [28]:
from sklearn.metrics import accuracy_score

In [30]:
# accuracy_score takes (y_true, y_pred): pass the true labels first, in the same
# order the confusion_matrix and classification_report calls in this notebook use.
# The value is unchanged because accuracy is symmetric in its two arguments.
accuracy_score(y_test, pred)

0.755

In [32]:
from sklearn.metrics import confusion_matrix

In [33]:
# Rows are the true labels and columns the predicted labels (both in the order 0, 1),
# so the diagonal holds the correctly classified reviews.
conf = confusion_matrix(y_test,pred)
conf

array([[71, 25],
       [24, 80]], dtype=int64)

In [34]:
from sklearn.metrics import classification_report

In [36]:
# Per-class precision, recall and F1-score on the held-out test split.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.75      0.74      0.74        96
           1       0.76      0.77      0.77       104

    accuracy                           0.76       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.75      0.76      0.75       200



# Testing our own data

In [43]:
# A single hand-written review, kept in a list so it can be passed to the
# vectorizer as a collection of documents.
own_data = ["Tasty pork belly"]
print(own_data)

['Tasty pork belly']


In [45]:
# Vectorize the own_data list itself. The previous code passed ['own_data'], i.e. the
# literal string "own_data", so the hand-written review was never actually transformed.
test_own_data = bow_transformer.transform(own_data)

In [46]:
# Densify the sparse row so it has the same array form the model was trained on.
data_new = test_own_data.toarray()
print("Coverted into array: \n", data_new)

Coverted into array: 
 [[0 0 0 ... 0 0 0]]


In [47]:
# Sanity check: one row, and as many columns as the training vocabulary has tokens.
print("Shape of the new data: \n", data_new.shape)

Shape of the new data: 
 (1, 2159)


In [48]:
# Classify the vectorized review with the trained model.
predict_new = model.predict(data_new)

In [49]:
# Show the predicted label for the new review (a one-element array of 0 or 1).
print(predict_new)

[1]


So the model predicts `1`, i.e. it classifies the review as positive (liked).