In [1]:
#Import packages (standard library first, then third-party)
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

#Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

#Shared objects used by the cells below
ps = PorterStemmer()                      #stemmer applied to every token
NV = GaussianNB()                         #Naive Bayes classifier
cv = CountVectorizer(max_features=1500)   #Bag-of-Words vectorizer limited to 1500 features

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Read the tab-separated reviews; quoting=3 (csv.QUOTE_NONE) makes the parser ignore double quotes
data = pd.read_csv("reviews.tsv", sep="\t", quoting=3)
#Display the first rows
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
#Summary statistics (count, mean, std, quartiles) for the dataset
data.describe()

Unnamed: 0,Liked
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [4]:
#Count the positive (1) and negative (0) reviews to check the class balance
data['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [5]:
#Data Preprocessing
def data_process(file, stop_words=None, stemmer=None):
    """Clean the review texts and return them as a list of stemmed strings.

    Every review is stripped of non-letter characters, lower-cased, split on
    whitespace, filtered against the stop-word list, stemmed and re-joined
    with single spaces.

    Parameters
    ----------
    file : pandas.DataFrame
        Table with a 'Review' column holding the raw review strings.
    stop_words : iterable of str, optional
        Tokens to drop. Defaults to NLTK's English stop-word list.
        NOTE: that list contains negations such as 'not'; pass a custom
        collection if they should be kept.
    stemmer : object with a ``stem(token)`` method, optional
        Defaults to the notebook-level PorterStemmer instance ``ps``.

    Returns
    -------
    list of str
        One cleaned string per row of ``file``, in row order.
    """
    #resolve the defaults once, instead of calling stopwords.words() for every token;
    #a set also makes the membership test cheap
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    if stemmer is None:
        stemmer = ps

    corpus = []

    #iterate over the values of the frame that was passed in (not a global),
    #so any index labelling works
    for review in file['Review']:
        #keep only alphabetical characters and convert to lowercase
        #(the range must be A-Z: 'A-z' would also let [ \ ] ^ _ ` through)
        cleaned = re.sub('[^a-zA-Z]', ' ', review).lower()
        #apply tokenization
        tokens = cleaned.split()
        #remove stopwords, then apply stemming
        stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

        corpus.append(" ".join(stems))

    return corpus


In [6]:
#Build the cleaned corpus: one preprocessed string per review
corpus = data_process(data)

In [7]:
#Feature engineering
#BOW method (Bag Of Words): fit the vectorizer on the corpus and densify the count matrix
x = cv.fit_transform(corpus).toarray()
#Select the target by column name rather than by position, so a change in column order cannot swap it silently
y = data['Liked'].values

In [8]:
#Check the dimensions of the feature matrix and of the label vector (one per line)
print(x.shape, y.shape, sep="\n")

(1000, 1500)
(1000,)


In [9]:
#Apply Naive Bayes Classification
#Split the dataset into train/test sets: 20% is held out for testing, random_state=0 makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [10]:
#Check the dimensions of the train and test feature matrices (one per line)
print(x_train.shape, x_test.shape, sep="\n")

(800, 1500)
(200, 1500)


In [11]:
#Train the model: fit the Gaussian Naive Bayes classifier on the training split
NV.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [12]:
#Run predictions on the held-out test split
y_pred = NV.predict(x_test)

In [13]:
#Measure model's performance: share of correct test predictions, shown as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy * 100)

73.0


In [14]:
#Per-class precision / recall / f1 on the test split.
#The result is stored under its own name: assigning it to `classification_report`
#would overwrite the imported function and make this cell fail when run a second time.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200

