### Importing libraries

In [1]:
import numpy as np
import pandas as pd

### Importing dataset




In [2]:
# File name of the Yelp reviews CSV, kept in one place so it is easy to repoint.
REVIEWS_CSV = 'yelp_review.csv'
dataset = pd.read_csv(REVIEWS_CSV)

In [3]:
# (rows, columns) of the frame as loaded from the CSV.
dataset.shape

(5261668, 9)

In [4]:
# Preview the leading rows to check the column layout and sample values.
dataset.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [5]:
# Keep only the rows whose index label is below 50,000 (labels 0..49999).
dataset = dataset[dataset.index < 50000]

In [6]:
# (rows, columns) after the row filter.
dataset.shape

(50000, 9)

In [7]:
# Column labels currently present in the frame.
dataset.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'date', 'text',
       'useful', 'funny', 'cool'],
      dtype='object')

In [18]:
# Narrow the frame to the review text and its star rating.
dataset = dataset.loc[:, ['text', 'stars']]

In [19]:
# (rows, columns) after restricting the frame to 'text' and 'stars'.
dataset.shape

(50000, 2)

In [20]:
# Column labels remaining after the selection.
dataset.columns

Index(['text', 'stars'], dtype='object')

### Data Preprocessing

In [21]:
# Text-cleaning toolkit: regex for stripping characters, NLTK for stopwords and stemming.
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stopword corpus used below.
nltk.download('stopwords')

# Stemmer instance reused for every token when the corpus is built.
ps = PorterStemmer()

# English stopword list with 'not' taken out, so 'not' is not filtered from reviews.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

[nltk_data] Downloading package stopwords to C:\Users\Digvijay
[nltk_data]     Mohite\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Build the cleaned corpus: one normalised, stemmed string per review.

# Loop-invariant work is done once up front: building the stopword set per word
# (as `word in set(all_stopwords)` does) repeats the same construction for every token.
stopword_set = set(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

corpus = []

# Iterate over the column values instead of range(0, 50000) so the cell does not
# depend on the exact row count or on the index labels being 0..49999.
for text in dataset['text']:
  # Replace every non-ASCII-letter character with a space, lowercase, split on whitespace.
  words = non_letters.sub(' ', text).lower().split()
  # Drop stopwords and reduce each remaining word to its Porter stem.
  stems = [ps.stem(word) for word in words if word not in stopword_set]
  # Re-join into a single space-separated string, the form CountVectorizer consumes.
  review = ' '.join(stems)
  corpus.append(review)

In [23]:
# Show only the first few cleaned reviews; echoing all of `corpus` would dump
# every review string into the cell output.
corpus[:5]

['super simpl place amaz nonetheless around sinc still serv thing start bologna salami sandwich mustard staff help friendli',
 'small unassum place chang menu everi often cool decor vibe insid seat restaur call reserv beef tartar pork belli start salmon dish lamb meal main everyth incred could go length list ingredi realli make dish amaz honestli need go bit outsid downtown montreal take metro less minut walk station',
 'lester locat beauti neighborhood sinc known smoke meat deli brisket sandwich come montreal got seat outsid go along insid smoke meat qualiti tast schwartz find less tourist lester well',
 'love come ye place alway need floor swept give peanut shell alway bit dirti food speak good burger made order meat put grill order sandwich get small burger mean patti regular patti burger twice delici get cajun fri add bit spice whatev size order alway throw fri lot fri bag',
 'chocol almond croissant amaz light butteri oh chocolati look light breakfast head perfect spot coffe latt 

### Data transformation

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Upper bound on the vocabulary size kept by the bag-of-words vectorizer.
MAX_FEATURES = 1420
cv = CountVectorizer(max_features=MAX_FEATURES)

In [25]:
# Fit the vectorizer on the cleaned corpus and encode every review as word counts,
# then densify the result into a regular array.
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()

# Targets: the star rating of each review ('stars' is the last column of `dataset`).
y = dataset['stars'].values

In [26]:
# Saving BoW dictionary to later use in prediction
import pickle
bow_path = 'c1_BoW_Sentiment_Model.pkl'
pickle.dump(cv, open(bow_path, "wb"))

### Dividing dataset into training and test set

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

### Model fitting (Naive Bayes)

In [28]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes model on the training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

In [29]:
# Exporting NB Classifier to later use in prediction
import joblib
joblib.dump(classifier, 'c2_Classifier_Sentiment_Model')

['c2_Classifier_Sentiment_Model']

### Model performance

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict a star rating for every review in the held-out split.
y_pred = classifier.predict(X_test)

# Confusion matrix of true versus predicted ratings on the held-out split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Accuracy on the held-out split; left as the last expression so it is the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1157  278   96   47  344]
 [ 312  282  236  118  304]
 [ 195  239  495  385  627]
 [ 180  223  553  945 1916]
 [ 365  193  317  696 4497]]


0.49173333333333336