In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd 
data = pd.read_csv("../data/food-review/Reviews.csv",index_col="Id")

In [3]:
data.columns

Index(['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [4]:
data.shape 

(568454, 9)

In [5]:
data = data.sample(5000)

In [6]:
review_data = data[['Text','Score']]

In [7]:
review_data[['Sentiment']] = review_data.Score.map(lambda x: 0 if x < 3 else 1)

In [8]:
review_data["Sentiment"].value_counts()

1    4264
0     736
Name: Sentiment, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(review_data.Text,review_data.Sentiment,
                                                    test_size=.20)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

USE_HASHING=False

if USE_HASHING:
    vectorizer = HashingVectorizer(
        stop_words="english", alternate_sign=False
    )
    X_train = vectorizer.transform(X_train)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    X_train = vectorizer.fit_transform(X_train)
    
X_test = vectorizer.transform(X_test)

# GaussianNB

In [11]:
%%time 
from sklearn.naive_bayes import GaussianNB
model = None
model = GaussianNB()
model.fit(X_train.toarray(), y_train)

Wall time: 1.06 s


GaussianNB()

In [12]:
%%time 
from sklearn.metrics import accuracy_score
pred_train=model.predict(X_train.toarray())
print('accuracy_score on training data : ',accuracy_score(y_train,pred_train))

pred_test=model.predict(X_test.toarray())
print('accuracy_score on test data: ',accuracy_score(y_test,pred_test))

accuracy_score on training data :  0.89725
accuracy_score on test data:  0.716
Wall time: 1.74 s


# BernoulliNB

In [13]:
%%time 
##  word occurrence vectors (rather than word count vectors) 
from sklearn.naive_bayes import BernoulliNB
model = None
model = BernoulliNB()
model.fit(X_train, y_train)

Wall time: 4 ms


BernoulliNB()

In [14]:
from sklearn.metrics import accuracy_score
pred_train=model.predict(X_train)
print('accuracy_score on training data : ',accuracy_score(y_train,pred_train))

pred_test=model.predict(X_test)
print('accuracy_score on test data: ',accuracy_score(y_test,pred_test))

accuracy_score on training data :  0.866
accuracy_score on test data:  0.845


# MultinomialNB

In [15]:
from sklearn.naive_bayes import MultinomialNB
model = None
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [16]:
%%time 
from sklearn.metrics import accuracy_score
pred_train=model.predict(X_train)
print('accuracy_score on training data : ',accuracy_score(y_train,pred_train))

pred_test=model.predict(X_test)
print('accuracy_score on test data: ',accuracy_score(y_test,pred_test))

accuracy_score on training data :  0.8525
accuracy_score on test data:  0.856
Wall time: 3.97 ms


# ComplementNB

In [17]:
%%time 
# CNB is an adaptation of the standard multinomial naive Bayes (MNB) algorithm 
# that is particularly suited for imbalanced data sets.
from sklearn.naive_bayes import ComplementNB
model = None
model = ComplementNB()
model.fit(X_train, y_train)

Wall time: 4 ms


ComplementNB()

In [18]:
from sklearn.metrics import accuracy_score
pred_train=model.predict(X_train)
print('accuracy_score on training data : ',accuracy_score(y_train,pred_train))

pred_test=model.predict(X_test)
print('accuracy_score on test data: ',accuracy_score(y_test,pred_test))

accuracy_score on training data :  0.87325
accuracy_score on test data:  0.859
