## Yelp Data Challenge - Restaurant Recommender with Graphlab Create (Environment: Python 2)

Yi Li

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")

In [3]:
df = pd.read_csv('../dataset/last_2_years_restaurant_reviews.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2016-03-31,0,6SgvNWJltnZhW7duJgZ42w,5,This is mine and my fiancé's favorite steakhou...,0,oFyOUOeGTRZhFPF9uTqrTQ
1,2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2015-06-29,0,iwx6s6yQxc7yjS7NFANZig,4,Nice atmosphere and wonderful service. I had t...,0,2aeNFntqY2QDZLADNo8iQQ


In [4]:
# Get business_id, user_id, stars for recommender
names = ['business_id', 'name', 'user_id', 'stars']
df_new = df[names]
df_new.head()

Unnamed: 0,business_id,name,user_id,stars
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,oFyOUOeGTRZhFPF9uTqrTQ,5
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,2aeNFntqY2QDZLADNo8iQQ,4
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,gmPP4YFrgYsYQqPYokMgFA,5
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,aVOGlN9fZ-BXcbtj6dbf0g,5
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,KC8H7qTZVPIEnanw9fG43g,5


#### There are many users that haven't given many reviews, exclude these users from the item-item similarity recommender

In [5]:
print(df_new['user_id'].value_counts().describe())
print('\n')
print((df_new['user_id'].value_counts()>5).describe())
print('\n')
print((df_new['user_id'].value_counts()>10).describe())

count    227241.000000
mean          2.269626
std           4.724827
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         748.000000
Name: user_id, dtype: float64


count     227241
unique         2
top        False
freq      212566
Name: user_id, dtype: object


count     227241
unique         2
top        False
freq      222081
Name: user_id, dtype: object


In [6]:
# only keep the users who have more than 10 reviews
df_filter = df_new.groupby('user_id').filter(lambda x: len(x) > 10)
df_filter.shape

(117389, 4)

In [7]:
len(df_filter['user_id'].unique()), len(df_filter['business_id'].unique())

(5160, 4394)

In [8]:
117389/(5160* 4394) # still a very sparse matrix

0

#### Load data in Dato's SFrame type

In [11]:
import graphlab

sf = graphlab.SFrame(df_filter[['user_id', 'business_id', 'stars']])

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1522351067.log


This non-commercial license of GraphLab Create for academic use is assigned to yl808@georgetown.edu and will expire on March 29, 2019.


In [28]:
sf.head()

user_id,business_id,stars
oFyOUOeGTRZhFPF9uTqrTQ,--9e1ONYQuAa-CB_Rrw7Tw,5
C6kw0Rny7jZAGjTj0MWA3Q,--9e1ONYQuAa-CB_Rrw7Tw,5
lUPSEbFRd3jDOM5agGnjWQ,--9e1ONYQuAa-CB_Rrw7Tw,4
JaqcCU3nxReTW2cBLHounA,--9e1ONYQuAa-CB_Rrw7Tw,5
n86B7IkbU20AkxlFX_5aew,--9e1ONYQuAa-CB_Rrw7Tw,4
3o8c_R-Kv5UOoXXuMEOZAw,--9e1ONYQuAa-CB_Rrw7Tw,3
611MiIXJkXM82I1y3Hg9eA,--9e1ONYQuAa-CB_Rrw7Tw,4
HVJgTH5qu0goywOHNpOjPA,--9e1ONYQuAa-CB_Rrw7Tw,5
togwFqr0eHwect2P2eZXZQ,--9e1ONYQuAa-CB_Rrw7Tw,5
y4O_c6UUAAtPb3Uk-T4t8A,--9e1ONYQuAa-CB_Rrw7Tw,5


##  Matrix Factorization recommender

Take a look at Graphlab Create examples

#### Create a matrix factorization model

In [15]:
rec = graphlab.recommender.factorization_recommender.create(
            sf,
            user_id='user_id',
            item_id='business_id',
            target='stars',
            solver='als',
            side_data_factorization=False)

#### Call the predict method on your input data to get the predicted rating for a user of a restaurant.

In [32]:
# get the user_id, business_id, and the original rating
sf[0]

{'business_id': '--9e1ONYQuAa-CB_Rrw7Tw',
 'stars': 5,
 'user_id': 'oFyOUOeGTRZhFPF9uTqrTQ'}

In [33]:
one_datapoint_sf = graphlab.SFrame({'user_id': ['oFyOUOeGTRZhFPF9uTqrTQ'], 
                                    'business_id': ['--9e1ONYQuAa-CB_Rrw7Tw']})
one_datapoint_sf

business_id,user_id
--9e1ONYQuAa-CB_Rrw7Tw,oFyOUOeGTRZhFPF9uTqrTQ


In [34]:
# see if the predicted rating makes sense
rec.predict(one_datapoint_sf)[0]

5.190843874615112

#### On the returned model object, call the list_fields method to see what kind of data is stored for your model.

In [35]:
rec.list_fields()

['adagrad_momentum_weighting',
 'additional_iterations_if_unhealthy',
 'binary_target',
 'coefficients',
 'data_load_time',
 'init_random_sigma',
 'item_id',
 'item_side_data_column_names',
 'item_side_data_column_types',
 'linear_regularization',
 'max_iterations',
 'model_name',
 'nmf',
 'num_factors',
 'num_features',
 'num_item_side_features',
 'num_items',
 'num_observations',
 'num_tempering_iterations',
 'num_user_side_features',
 'num_users',
 'observation_data_column_names',
 'random_seed',
 'regularization',
 'regularization_type',
 'sgd_convergence_interval',
 'sgd_convergence_threshold',
 'sgd_max_trial_iterations',
 'sgd_sampling_block_size',
 'sgd_step_adjustment_interval',
 'sgd_step_size',
 'sgd_trial_sample_minimum_size',
 'sgd_trial_sample_proportion',
 'side_data_factorization',
 'solver',
 'step_size_decrease_rate',
 'target',
 'tempering_regularization_start_value',
 'track_exact_loss',
 'training_rmse',
 'training_stats',
 'training_time',
 'user_id',
 'user_side_

#### Inspect the output of get('coefficients') to see what information your model uses.

In [36]:
rec['coefficients'] 

{'business_id': Columns:
 	business_id	str
 	linear_terms	float
 	factors	array
 
 Rows: 4394
 
 Data:
 +------------------------+--------------+-------------------------------+
 |      business_id       | linear_terms |            factors            |
 +------------------------+--------------+-------------------------------+
 | --9e1ONYQuAa-CB_Rrw7Tw |     0.0      | [0.0900035127997, -0.07425... |
 | -1m9o3vGRA8IBPNvNqKLmA |     0.0      | [1.24483847618, 3.63663077... |
 | -1vfRrlnNnNJ5boOVghMPA |     0.0      | [-0.0314954519272, -0.0948... |
 | -3zffZUHoY8bQjGfPSoBKQ |     0.0      | [0.626619040966, -0.141688... |
 | -8R_-EkGpUhBk55K9Dd4mg |     0.0      | [-0.0709431096911, 0.12764... |
 | -9YyInW1wapzdNZrhQJ9dg |     0.0      | [0.797298967838, -0.018459... |
 | -AD5PiuJHgdUcAK-Vxao2A |     0.0      | [-0.11172413826, -0.289737... |
 | -ADtl9bLp8wNqYX1k3KuxA |     0.0      | [0.0426351353526, 0.230083... |
 | -BS4aZAQm9u41YnB9MUASA |     0.0      | [0.0907431989908, 0.029872...

In [50]:
item_sf = rec['coefficients']['business_id']
user_sf = rec['coefficients']['user_id']
print(len(item_sf), len(user_sf))
print(len(item_sf['factors'][0]), len(user_sf['factors'][0])) 

(4394, 5160)
(8, 8)


#### Without using the predict method, compute the predicted rating

In [48]:
item_array = item_sf[item_sf['business_id'] == '--9e1ONYQuAa-CB_Rrw7Tw']['factors'][0]
user_array = user_sf[user_sf['user_id'] == 'oFyOUOeGTRZhFPF9uTqrTQ']['factors'][0]
intercept = rec['coefficients']['intercept']
print("rating:", np.dot(item_array, user_array) + intercept)    # 4

('rating:', 5.1908438213766566)


#### The intercept term is the scaling factor. We can compute the value by taking the average of all the ratings in the original dataset.

In [53]:
print("intercept:", intercept)
print("average:", np.average(sf['stars']))

('intercept:', 3.8195827547725827)
('average:', 3.8195827547725938)


#### Call the predict method on your input data to get the predicted ratings, and verify that the RMSE reported by the model diagnostics is correct.

In [56]:
from sklearn.metrics import mean_squared_error

predictions = rec.predict(sf)
rmse = np.sqrt(mean_squared_error(sf['stars'], predictions))

print("graphlab's reported rmse:", rec['training_rmse'])
print("calculated rmse:", rmse)

("graphlab's reported rmse:", 0.5145963002395231)
('calculated rmse:', 0.51459630023952319)


In [60]:
pd.Series(rec.predict(sf)).describe()

count    117389.000000
mean          3.816336
std           1.105862
min          -0.364522
25%           3.254590
50%           4.005467
75%           4.660924
max           6.756745
dtype: float64

In [58]:
pd.Series(sf['stars']).describe()

count    117389.000000
mean          3.819583
std           1.219838
min           1.000000
25%           3.000000
50%           4.000000
75%           5.000000
max           5.000000
dtype: float64

### Regularization - graphlab provides two regularization parameters.
The parameter regularization controls the value of lambda. Using what you know about regularization from linear regression, what effect would you expect this to have on solutions? What would you expect to see in the difference of training RMSE between setting this parameter to 0 or 0.1? Try it.

In [66]:
random_seed = 0
regularization_param = 1e-6
rec2 = graphlab.recommender.factorization_recommender.create(
            sf,
            user_id='user_id',
            item_id='business_id',
            target='stars',
            solver='als',
            side_data_factorization=False,
            regularization=regularization_param,
            random_seed=random_seed);

('training rmse with regularization 1e-06:', 0.5081707245885436)


In [67]:
print("training rmse with regularization %s:"%regularization_param, rec2['training_rmse'])

('training rmse with regularization 1e-06:', 0.5081707245885436)
