In [1]:
import numpy as np
import pandas as pd

In [2]:
yelp_data = pd.read_csv("yelp.csv")

In [3]:
yelp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
business_id    10000 non-null object
date           10000 non-null object
review_id      10000 non-null object
stars          10000 non-null int64
text           10000 non-null object
type           10000 non-null object
user_id        10000 non-null object
cool           10000 non-null int64
useful         10000 non-null int64
funny          10000 non-null int64
dtypes: int64(4), object(6)
memory usage: 781.3+ KB


In [4]:
yelp_data.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [5]:
X = yelp_data['text']

In [6]:
y = yelp_data['stars']

In [7]:
X.head()

0    My wife took me here on my birthday for breakf...
1    I have no idea why some people give bad review...
2    love the gyro plate. Rice is so good and I als...
3    Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4    General Manager Scott Petello is a good egg!!!...
Name: text, dtype: object

In [8]:
y.head()

0    5
1    5
2    4
3    5
4    5
Name: stars, dtype: int64

In [9]:
len(y)

10000

In [10]:
len(X)

10000

In [11]:
print(y.value_counts())

4    3526
5    3337
3    1461
2     927
1     749
Name: stars, dtype: int64


# splitting into training and testing

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,stratify = y,random_state = 42)

In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8000,)
(2000,)
(8000,)
(2000,)


# NLP 
## transformations

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
vect = CountVectorizer()

In [17]:
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [18]:
X_train_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)

## tf-idf transformer

In [19]:
from sklearn.feature_extraction.text import TfidfTransformer

In [20]:
tf_idf = TfidfTransformer(use_idf=True,norm = 'l2',smooth_idf=True)

In [21]:
tf_idf.fit(X_train_vect)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [22]:
X_train_idf = tf_idf.transform(X_train_vect)
X_test_idf = tf_idf.transform(X_test_vect)

In [23]:
print(X_test_idf.shape)
print(X_train_idf.shape)

(2000, 26349)
(8000, 26349)


# logistic regression 

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
log_reg = LogisticRegression(multi_class = 'multinomial',solver = 'lbfgs')

In [26]:
log_reg.fit(X_train_idf,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
log_reg_score = log_reg.score(X_test_idf,y_test)

In [28]:
log_reg_score

0.5235

In [29]:
y_pred = log_reg.predict(X_test_idf)

In [30]:
from sklearn.metrics import confusion_matrix,recall_score,precision_score,f1_score

In [31]:
confusion_matrix(y_true=y_test,y_pred = y_pred)

array([[ 61,  26,   8,  26,  29],
       [ 18,  36,  47,  61,  23],
       [  7,  11,  73, 162,  39],
       [  2,   3,  38, 453, 209],
       [  3,   0,   6, 235, 424]])

In [32]:
print("precision:", precision_score(y_true=y_test, y_pred=y_pred,average= 'weighted'))
print("recall:", recall_score(y_true=y_test, y_pred=y_pred,average = 'weighted'))
print("f1-score:", f1_score(y_true=y_test, y_pred=y_pred,average = 'weighted'))

precision: 0.5220767310469372
recall: 0.5235
f1-score: 0.5073925298452002


# 2 gram

In [33]:
vect2g = CountVectorizer(ngram_range=(1,2))
vect2g.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [34]:
X_train_2g = vect2g.transform(X_train)

In [35]:
X_test_2g = vect2g.transform(X_test)

In [36]:
X_train_tf2g = tf_idf.fit_transform(X_train_2g)

In [37]:
X_test_tf2g =tf_idf.transform(X_test_2g)

# logistic regression

In [38]:
log_reg2g = LogisticRegression()

In [39]:
log_reg2g.fit(X_train_tf2g,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
log_reg2g_score = log_reg2g.score(X_test_tf2g,y_test)

In [41]:
log_reg2g_score

0.512

In [43]:
y_pred_2g = log_reg2g.predict(X_test_tf2g)

In [44]:
confusion_matrix(y_true = y_test,y_pred = y_pred_2g)

array([[ 37,  13,   4,  56,  40],
       [ 11,  16,  30,  99,  29],
       [  3,   4,  37, 212,  36],
       [  0,   1,  10, 512, 182],
       [  1,   0,   0, 245, 422]])

In [53]:
print("precision:", precision_score(y_true=y_test, y_pred=y_pred_2g,average= 'weighted'))
print("recall:", recall_score(y_true=y_test, y_pred=y_pred_2g,average = 'weighted'))
print("f1-score:", f1_score(y_true=y_test, y_pred=y_pred_2g,average = 'weighted'))

precision: 0.5229538568978928
recall: 0.512
f1-score: 0.4720276237445014


# random forest 2g

In [46]:
from sklearn.ensemble import RandomForestClassifier
rnd_2g = RandomForestClassifier()

In [47]:
rnd_2g.fit(X_train_tf2g,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
rnd_2g_score = rnd_2g.score(X_test_tf2g,y_test)

In [49]:
rnd_2g_score

0.411

In [50]:
y_pred_rnd_2g = rnd_2g.predict(X_test_tf2g)

In [51]:
confusion_matrix(y_true=y_test,y_pred=y_pred_rnd_2g)

array([[ 23,  15,  11,  66,  35],
       [  6,  11,  30, 103,  35],
       [  6,   9,  34, 199,  44],
       [  5,  10,  48, 432, 210],
       [  7,   4,  25, 310, 322]])

In [52]:
print("precision:", precision_score(y_true=y_test, y_pred=y_pred_rnd_2g,average= 'weighted'))
print("recall:", recall_score(y_true=y_test, y_pred=y_pred_rnd_2g,average = 'weighted'))
print("f1-score:", f1_score(y_true=y_test, y_pred=y_pred_rnd_2g,average = 'weighted'))

precision: 0.3946801356479761
recall: 0.411
f1-score: 0.3802701468050653


In [67]:
X_train_idf_3D = X_train_idf.toarray()
print(type(X_train_idf_3D))
#type(X_train_idf)
#X_train_idf = X_train_idf[...,np.newaxis]

MemoryError: 

# RNN

In [60]:
from tensorflow import keras
model_simpleRNN = keras.models.Sequential()
model_simpleRNN.add(keras.layers.Embedding(input_dim=10000, output_dim=10))
model_simpleRNN.add(keras.layers.SimpleRNN(25))
model_simpleRNN.add(keras.layers.Dense())

model_simpleRNN.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model_simpleRNN.summary()

ValueError: Input 0 of layer simple_rnn_2 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 26349]