In [207]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

In [208]:
# Load the historic restaurant-review dump (tab-separated) into a DataFrame.
# quoting = 3 is csv.QUOTE_NONE, so quote characters inside a review are kept as literal text.
# NOTE(review): the absolute /content/drive/... path ties this cell to one environment — consider a configurable data directory.
review_dataset = pd.read_csv('/content/drive/MyDrive/ML datasets/Sentiment Analysis/a1_RestaurantReviews_HistoricDump.tsv', delimiter = '\t', quoting = 3)

In [209]:
# Preview the first rows to confirm the Review / Liked columns loaded as expected.
review_dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [210]:
# (rows, columns) of the loaded frame.
review_dataset.shape

(900, 2)

## Data Preprocessing

In [211]:
# Fetch the NLTK stop-word corpus that the stemming step filters against.
# NOTE(review): this import sits outside the notebook's top import cell — consider moving it there.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [212]:
# Count missing values per column before any text cleaning.
review_dataset.isnull().sum()

Review    0
Liked     0
dtype: int64

## Stemming the reviews

In [213]:
# Single Porter stemmer instance, read as a global by Stemming().
port_stem = PorterStemmer()

In [214]:
def Stemming(content):
  """Normalise one review into a space-separated string of stemmed tokens.

  Every character outside a-z / A-Z is replaced by a space, the text is
  lowercased and split on whitespace, English stop words are dropped and
  the remaining tokens are Porter-stemmed with the global ``port_stem``.

  Args:
    content: Raw review text (str).

  Returns:
    str: The surviving stems joined by single spaces; an empty string when
    no token survives the filtering.
  """
  # The original evaluated stopwords.words('english') once per token and
  # tested membership against the resulting list. The stop-word list is
  # loop-invariant, so build it once, keep it as a frozenset for O(1)
  # lookups, and cache it on the function for subsequent calls.
  stop_words = getattr(Stemming, '_stop_words', None)
  if stop_words is None:
    stop_words = frozenset(stopwords.words('english'))
    Stemming._stop_words = stop_words

  tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  # NOTE(review): stop-word removal also drops negations (e.g. "Crust is not
  # good." becomes "crust good"), which can hide the sentiment of a review —
  # consider keeping words such as "not" if model quality matters.
  return ' '.join(port_stem.stem(word) for word in tokens if word not in stop_words)

In [215]:
# Replace the raw 'Review' text with its cleaned, stemmed form.
# NOTE(review): this overwrites the raw column in place, so re-running the cell re-processes already-stemmed text; reload the dataset first if the raw text is needed.
review_dataset['Review'] = review_dataset['Review'].apply(Stemming)

In [216]:
# Show the cleaned frame as plain text to eyeball the stemming result.
print(review_dataset)

                                                Review  Liked
0                                       wow love place      1
1                                           crust good      0
2                                   tasti textur nasti      0
3    stop late may bank holiday rick steve recommen...      1
4                              select menu great price      1
..                                                 ...    ...
895         want first say server great perfect servic      1
896                                  pizza select good      1
897                                strawberri tea good      1
898              highli unprofession rude loyal patron      0
899                                overal great experi      1

[900 rows x 2 columns]


## Data Transformation

Now we will convert our cleaned dataset into a bag-of-words representation.

In [217]:
# Bag-of-words vectorizer whose vocabulary is capped at 1420 terms.
# NOTE(review): 1420 is not explained anywhere in the visible notebook — confirm how it was chosen.
count_vectorizer = CountVectorizer(max_features = 1420)

In [218]:
# NOTE(review): this encoder is not used by any cell visible in this part of the notebook — confirm it is still needed.
encoder = OneHotEncoder()

In [219]:
# NOTE(review): the vocabulary is fitted on every review before the train/test split below, so held-out reviews influence the feature set.
x = count_vectorizer.fit_transform(review_dataset['Review']).toarray() # learn the vocabulary and turn each cleaned review into a dense row of per-term counts
y = review_dataset.iloc[:, -1].values # last column of the frame (the 'Liked' labels) as a NumPy array

In [220]:
# Inspect the label vector.
print(y)

[1 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1
 0 1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 0
 0 0 0 1 1 0 0 0 0 1 0 1 0 1 1 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0
 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0
 0 0 1 1 0 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0 1
 1 0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 0 0
 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 1 0 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1
 0 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1
 0 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1
 0 1 1 0 0 1 0 0 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 0 1 1 1
 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0 1
 1 1 0 1 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 0 1 1 0
 1 1 0 0 1 1 0 1 0 0 0 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 1 0
 1 0 0 1 1 1 0 0 1 1 1 0 

In [221]:
# Save the fitted bag-of-words vectorizer so prediction code can reuse the same vocabulary.
import pickle
bow_path = '/content/drive/MyDrive/ML datasets/Sentiment Analysis/c1_BoW_Sentiment_Model.pkl'
# Open the file in a context manager so the handle is flushed and closed as soon as
# the dump finishes; the original passed a bare open(...) that was never closed.
with open(bow_path, 'wb') as bow_file:
  pickle.dump(count_vectorizer, bow_file)

In [222]:
# Hold out 20% of the rows for evaluation; random_state = 0 fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.20, random_state = 0)

## Model Fitting

In [223]:
# Number of training labels after the split.
y_train.shape

(720,)

In [224]:
# Fit a Gaussian Naive Bayes classifier on the dense bag-of-words training matrix.
classifier = GaussianNB()
classifier.fit(x_train, y_train)

In [225]:
# Export the fitted NB classifier for later use in prediction.
# NOTE(review): the target is a bare relative filename, so the file lands in the current working directory rather than the Drive folder used for the BoW pickle — confirm that is intended.
import joblib
joblib.dump(classifier, 'c2_Classifier_Sentiment_Model')

['c2_Classifier_Sentiment_Model']

In [226]:
# Score the held-out split: predict once, derive both metrics from the same
# predictions, then report the confusion matrix followed by the accuracy.
pred = classifier.predict(x_test)
cm, accuracy = confusion_matrix(y_test, pred), accuracy_score(y_test, pred)
print(cm, accuracy, sep='\n')

[[68 10]
 [46 56]]
0.6888888888888889
