In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Load the Yelp review data from yelp.csv (path is relative to the working directory).
data = pd.read_csv("yelp.csv")
# Show only the first row to inspect the available columns.
data.head(1)


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0


In [3]:
# Features are the raw review text; the target is the star rating.
X = data["text"]
y = data["stars"]

# Hold out part of the data for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Report the size of each split, in the order: X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(7500,)
(2500,)
(7500,)
(2500,)


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: drop English stop words and ignore tokens that
# appear in fewer than 2 training documents.
vect = CountVectorizer(stop_words="english", min_df=2)

# Learn the vocabulary from the training text and build the training
# document-term matrix in a single pass. A separate vect.fit(...) call before
# this is unnecessary: fit_transform re-learns the vocabulary from scratch.
X_train_dtm = vect.fit_transform(X_train)

# Encode the test text with the vocabulary learned from the training text
# (transform only, so nothing is learned from the test set).
X_test_dtm = vect.transform(X_test)

In [5]:
# Import and instantiate a logistic regression model with its default settings.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
# Train the model on the training document-term matrix and star labels.
# %time is an IPython line magic that reports how long the statement took.
%time logreg.fit(X_train_dtm, y_train)

Wall time: 4.06 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
# Predict a star rating for each review in the test document-term matrix
# using the fitted logistic regression model.
y_pred_class = logreg.predict(X_test_dtm)

In [7]:
# predict_proba returns one column per class, ordered as in logreg.classes_.
# [:, 1] keeps only the second column, i.e. the predicted probability of the
# second class only.
# NOTE(review): the target here has five star classes, so this is not a
# "positive class" probability as it would be in a binary problem — confirm
# that selecting a single column is intended.
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
print(y_pred_prob)

[2.88698028e-01 4.29513299e-04 6.89144372e-04 ... 1.65236954e-03
 4.27661418e-04 4.71303667e-06]


In [8]:
# Fraction of test reviews whose predicted star rating (y_pred_class)
# exactly matches the true rating (y_test).
metrics.accuracy_score(y_test, y_pred_class)

0.4844

In [9]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training document-term matrix.
# fit() returns the estimator itself, so the fitted model is bound to nb directly.
nb = MultinomialNB().fit(X_train_dtm, y_train)

# Predict star ratings for the test set. Note that this rebinds y_pred_class,
# which from here on holds the Naive Bayes predictions.
y_pred_class = nb.predict(X_test_dtm)

# Share of test reviews whose predicted rating equals the true rating.
accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(accuracy)

0.496


In [10]:
# Confusion matrix for the current y_pred_class: rows are true star ratings,
# columns are predicted star ratings.
star_confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(star_confusion)

[[ 84  34  32  24  11]
 [ 36  48  56  71  23]
 [ 14  18  93 196  44]
 [ 17  13  46 571 237]
 [ 17   7  18 346 444]]


In [11]:
# Vocabulary learned by the vectorizer, in document-term-matrix column order.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); prefer the new method when it exists and fall
# back to the old one so the cell runs on both old and new versions.
if hasattr(vect, "get_feature_names_out"):
    X_train_token = vect.get_feature_names_out()
else:
    X_train_token = vect.get_feature_names()

# nb.feature_count_ has one row per class and one column per token: row k holds
# how often each token was seen in training reviews of the k-th class.
star_1 = nb.feature_count_[0, :]
star_2 = nb.feature_count_[1, :]
star_3 = nb.feature_count_[2, :]
star_4 = nb.feature_count_[3, :]
star_5 = nb.feature_count_[4, :]

In [12]:
# One row per vocabulary token, with its count in each star class as columns.
per_class_counts = {
    'star_1': star_1,
    'star_2': star_2,
    'star_3': star_3,
    'star_4': star_4,
    'star_5': star_5,
}
tokens = pd.DataFrame({'token': X_train_token, **per_class_counts})
print(tokens.head(5))

  token  star_1  star_2  star_3  star_4  star_5
0    00    19.0    28.0    30.0    49.0    33.0
1   000     5.0     2.0     2.0     3.0     7.0
2  00am     3.0     0.0     0.0     2.0     2.0
3  00pm     1.0     0.0     0.0     5.0     5.0
4    01     1.0     0.0     1.0     1.0     2.0


In [13]:
# Add 1 to every per-class token count so that no token has a zero count in
# any class (zero counts would make later ratios degenerate).
# The smoothed columns are rebuilt from nb.feature_count_ instead of being
# incremented in place, so re-running this cell never adds 1 a second time.
for class_idx, count_col in enumerate(['star_1', 'star_2', 'star_3', 'star_4', 'star_5']):
    tokens[count_col] = nb.feature_count_[class_idx, :] + 1

In [14]:
# Turn each per-class token count into a frequency by dividing it by the
# number of training reviews in that class (nb.class_count_).
count_to_freq = [
    ('star_1', 'one_star'),
    ('star_2', 'two_star'),
    ('star_3', 'three_star'),
    ('star_4', 'four_star'),
    ('star_5', 'five_star'),
]
for class_pos, (count_name, freq_name) in enumerate(count_to_freq):
    tokens[freq_name] = tokens[count_name] / nb.class_count_[class_pos]

In [15]:
# Share of a token's total count (summed over all five star classes) that
# comes from five-star reviews. This uses the star_N count columns, not the
# one_star..five_star frequency columns.
star_count_cols = ['star_1', 'star_2', 'star_3', 'star_4', 'star_5']
# skipna=False keeps the same NaN propagation as adding the columns directly.
all_star_total = tokens[star_count_cols].sum(axis=1, skipna=False)
tokens['five_star_ratio'] = tokens['star_5'] / all_star_total

In [16]:
 # Rank tokens from highest to lowest five_star_ratio and show the top 5 rows.
tokens_by_ratio = tokens.sort_values(by='five_star_ratio', ascending=False)
tokens_by_ratio.head(5)

Unnamed: 0,token,star_1,star_2,star_3,star_4,star_5,one_star,two_star,three_star,four_star,five_star,five_star_ratio
3953,dr,5.0,10.0,5.0,11.0,91.0,0.008865,0.01443,0.004562,0.004164,0.036327,0.745902
6634,jeff,2.0,1.0,1.0,5.0,25.0,0.003546,0.001443,0.000912,0.001893,0.00998,0.735294
2815,cone,1.0,1.0,1.0,1.0,11.0,0.001773,0.001443,0.000912,0.000379,0.004391,0.733333
3553,dentist,1.0,6.0,1.0,2.0,27.0,0.001773,0.008658,0.000912,0.000757,0.010778,0.72973
2425,christopher,1.0,1.0,1.0,5.0,21.0,0.001773,0.001443,0.000912,0.001893,0.008383,0.724138


In [22]:
# Number of training samples the Naive Bayes model saw for each class during
# fitting (one entry per class).
nb.class_count_

array([ 564.,  693., 1096., 2642., 2505.])

In [23]:
# Count how many training reviews carry each star rating, computed directly
# from y_train (a cross-check for nb.class_count_).

value_of_each_class=y_train.value_counts()
print(value_of_each_class)

4    2642
5    2505
3    1096
2     693
1     564
Name: stars, dtype: int64


In [32]:
# Predict the star rating of a single hand-written review.
text = ["food was spicy and not good enough"]
# Encode the review with the vocabulary already learned by vect (transform only).
text_dtm = vect.transform(text)
# Classify it with the fitted Naive Bayes model.
predict = nb.predict(text_dtm)
print(predict)


[4]
