## Yelp Data Challenge - Restaurant Recommender

Yi Li

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")

In [2]:
# Read the restaurant review records into a DataFrame
df = pd.read_csv(filepath_or_buffer='../dataset/last_2_years_restaurant_reviews.csv')

In [3]:
# Preview the first two rows to see which columns were loaded
df.head(2)

Unnamed: 0.1,Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2016-03-31,0,6SgvNWJltnZhW7duJgZ42w,5,This is mine and my fiancé's favorite steakhou...,0,oFyOUOeGTRZhFPF9uTqrTQ
1,2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2015-06-29,0,iwx6s6yQxc7yjS7NFANZig,4,Nice atmosphere and wonderful service. I had t...,0,2aeNFntqY2QDZLADNo8iQQ


## 1. Clean data and get rating data 

#### Select relevant columns in the original dataframe

In [58]:
# Keep only what the recommender needs: restaurant id and name,
# the reviewing user, and the star rating
names = ['business_id', 'name', 'user_id', 'stars']
df_new = df.loc[:, names]

In [59]:
# Preview the reduced frame (one row per review)
df_new.head()

Unnamed: 0,business_id,name,user_id,stars
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,oFyOUOeGTRZhFPF9uTqrTQ,5
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,2aeNFntqY2QDZLADNo8iQQ,4
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,gmPP4YFrgYsYQqPYokMgFA,5
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,aVOGlN9fZ-BXcbtj6dbf0g,5
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,KC8H7qTZVPIEnanw9fG43g,5


#### Many users have written only a few reviews; exclude these users from the item-item similarity recommender

**Q**: How do we recommend to these users anyways?

A: We can recommend the most popular restaurants to these users. 

In [60]:
# (number of distinct users, number of distinct restaurants);
# dropna=False counts NaN as a value, exactly like len(Series.unique())
df_new['user_id'].nunique(dropna=False), df_new['business_id'].nunique(dropna=False)

(227241, 4832)

In [61]:
# Fraction of the user x restaurant grid covered by reviews, computed from the
# data instead of hand-copied counts so it stays correct if the dataset changes
len(df_new) / (df_new['user_id'].nunique() * df_new['business_id'].nunique())  # the matrix is very sparse

0.00046970729299240636

In [62]:
# Number of reviews written by each user
reviews_per_user = df_new['user_id'].value_counts()

# Distribution of review counts, then how many users clear each threshold
print(reviews_per_user.describe())
print('\n')
print((reviews_per_user > 5).describe())
print('\n')
print((reviews_per_user > 10).describe())

count    227241.000000
mean          2.269626
std           4.724827
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         748.000000
Name: user_id, dtype: float64


count     227241
unique         2
top        False
freq      212566
Name: user_id, dtype: object


count     227241
unique         2
top        False
freq      222081
Name: user_id, dtype: object


In [63]:
# Keep only the reviews written by users with more than 10 reviews
review_counts = df_new['user_id'].value_counts()
active_users = review_counts.index[review_counts > 10]
df_filter = df_new[df_new['user_id'].isin(active_users)]
df_filter.shape

(117389, 4)

In [64]:
# (distinct users, distinct restaurants) remaining after the filter;
# dropna=False counts NaN as a value, exactly like len(Series.unique())
df_filter['user_id'].nunique(dropna=False), df_filter['business_id'].nunique(dropna=False)

(5160, 4394)

In [65]:
# Fraction of the filtered user x restaurant grid covered by reviews, computed
# from the data instead of hand-copied counts so it tracks the filter threshold
len(df_filter) / (df_filter['user_id'].nunique() * df_filter['business_id'].nunique())  # still a very sparse matrix

0.0051774706876537065

#### Create utility matrix from records

In [66]:
# Utility matrix: one row per user, one column per restaurant, cells hold the
# star rating. pivot_table's default aggregation (mean) combines repeated
# reviews of the same restaurant by the same user; unrated cells become 0.
df_utility = df_filter.pivot_table(
    values='stars',
    index='user_id',
    columns='business_id',
    fill_value=0,
)

In [67]:
# (n_users, n_restaurants) of the utility matrix
df_utility.shape

(5160, 4394)

In [68]:
# Preview the utility matrix; zeros are the fill value for unrated restaurants
df_utility.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-1m9o3vGRA8IBPNvNqKLmA,-1vfRrlnNnNJ5boOVghMPA,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-9YyInW1wapzdNZrhQJ9dg,-AD5PiuJHgdUcAK-Vxao2A,-ADtl9bLp8wNqYX1k3KuxA,-BS4aZAQm9u41YnB9MUASA,-Bf8BQ3yMk8U2f45r2DRKw,...,zmltWmTpoBt5sCU-5Kzj-Q,znWHLW1pt19HzW1VY6KfCA,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zrQ1zKWC-W2PCvwjBururQ,zsQk990PubOHjr1YcLkQFw,zt9RLUIU32fZYOBh2L0NNQ,zttcrQP4MxNS5X5itzStXg,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--ZNfWKj1VyVElRx6-g1fg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
-05XqtNjcBq19vh2CVJN8g,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-2gOxVWcnBr5DclrrsWXCA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-4JDJeFS0YAYSiSvIshGLQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-50XWnmQGqBgEI-9ANvLlg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2. Item-Item similarity recommender

### Let's reuse the ItemItemRecommender class derived from the previous exercise

Hint: we need to make modifications to accommodate the dense numpy array

In [69]:
from sklearn.metrics.pairwise import cosine_similarity

# Transpose so each row is a restaurant described by the ratings it received,
# then compare every restaurant with every other one
item_user_matrix = df_utility.T

# Item-Item Similarity Matrix
item_sim_mat = cosine_similarity(item_user_matrix)

In [70]:
# Square matrix: one row and one column per restaurant
item_sim_mat.shape

(4394, 4394)

In [71]:
# Number of most-similar restaurants kept for each item
neighborhood_size = 50

# For every item (row), the column indices ordered from least to most similar
least_to_most_sim_indexes = item_sim_mat.argsort(axis=1)

# Neighborhoods: the last `neighborhood_size` entries of each row, i.e. the
# most similar items
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]

In [72]:
# (n_items, neighborhood_size)
neighborhoods.shape

(4394, 50)

In [73]:
# Neighbor indices of the first two items; the values are column positions in
# df_utility, ordered from least to most similar within each row
neighborhoods[:2]

array([[ 494, 3636, 3713,    3, 1784, 1291,  797, 1760, 1450, 4236,  386,
         585, 1321, 2495, 1796, 1623, 3813, 1561, 3009,  399, 2870, 1542,
         794, 3514, 2530, 4032,  942, 2214, 2037,  519, 3857, 3173, 2302,
        1967, 1852, 3014, 2981, 1808, 2677, 3021,  217, 3072, 2732, 3796,
        1214,  870, 3865, 1498,  199,    0],
       [3511,  534, 1196, 4281, 2557, 1923, 1205, 4259, 4264, 1551, 3021,
        4207,  431, 1030, 2811, 2507, 2415, 1436, 3298, 1319,  152, 2923,
         922, 3530, 2981,  600,  585,  788, 2419, 2195, 1659, 1841, 2015,
        3881, 4203, 3581, 3841, 2718, 3574, 1991, 2399, 2037, 2302, 3455,
        3909, 3746, 2187, 2439, 3011,    1]])

In [84]:
# Let's pick a lucky user
# NOTE: despite the name, this is an integer row position in df_utility
# (it is used as df_utility.values[user_id]), not a Yelp user_id string.
user_id = 100

In [85]:
from time import time

n_users = df_utility.shape[0]
n_items = df_utility.shape[1]

start_time = time()

# Pull this user's row of ratings out once instead of re-reading
# df_utility.values on every loop iteration
user_ratings = df_utility.values[user_id]
items_rated_by_this_user = user_ratings.nonzero()[0]

# Predicted rating per item; stays 0 wherever no prediction can be made
out = np.zeros(n_items)
for item_to_rate in range(n_items):
    # Items that are both in this item's neighborhood and rated by the user
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection operation
    similarities = item_sim_mat[item_to_rate, relevant_items]
    total_similarity = similarities.sum()
    # Skip items with no usable neighbors explicitly rather than dividing 0/0
    # (which yields NaN plus a RuntimeWarning that then has to be patched up)
    if total_similarity != 0:
        # Similarity-weighted average of the user's ratings on the neighbors
        out[item_to_rate] = np.dot(user_ratings[relevant_items], similarities) / total_similarity

pred_ratings = out
print(pred_ratings)
print("Execution time: %f seconds" % (time()-start_time))

[ 5.  0.  0. ...,  0.  0.  0.]
Execution time: 0.180905 seconds


  


In [86]:
# One predicted rating per restaurant
pred_ratings.shape

(4394,)

In [87]:
# Recommend n restaurants
n = 10

# Get item indexes sorted by predicted rating (ascending, best last)
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))

# Find items that have been rated by user. Read them from df_utility: the
# `ratings_mat` used here before is never defined in this notebook, so the
# cell failed with NameError on a fresh kernel.
items_rated_by_this_user = df_utility.values[user_id].nonzero()[0]

# We want to exclude the items that have been rated by user; a set makes each
# membership check constant-time instead of a scan over the array
already_rated = set(items_rated_by_this_user)
unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in already_rated]

unrated_items_by_pred_rating[-n:]

[1948, 1208, 3857, 3594, 3106, 271, 2999, 3444, 1108, 2646]

In [88]:
# Translate the recommended column positions back into business ids
list(df_utility.columns[unrated_items_by_pred_rating[-n:]])

['RSOinkOUpxm0mGw1IWr4Xw',
 'GU7ww3eoGTcXAze8VJtC7g',
 'rcaPajgKOJC2vo_l3xa42A',
 'nu1GQ_Q43JqzrWMZE_dE1Q',
 'gqELGWcpWepjV33yzc4Diw',
 '2sx52lDoiEtef7xgPCaoBw',
 'f9sU31meK0bqAD7922sCog',
 'l_GV0hgEoTUf70uJVT0_hg',
 'EwUM6gGMBhrjOef0wmqKJQ',
 'aC4acQbkuD7_JizCiFBuDQ']

In [89]:
# Look up the restaurant name(s) recorded for each recommended business id
recommended_ids = df_utility.columns[unrated_items_by_pred_rating[-n:]]
[df_filter.loc[df_filter['business_id'] == business_id, 'name'].unique()
 for business_id in recommended_ids]

[array(['Katsuya'], dtype=object),
 array(['Fin'], dtype=object),
 array(['Bouchon at the Venezia Tower'], dtype=object),
 array(['Searsucker'], dtype=object),
 array(['Santa Ana Cafe'], dtype=object),
 array(['VegeNation'], dtype=object),
 array(['Sage'], dtype=object),
 array(['Eat.'], dtype=object),
 array(["Jared's Old Fashioned"], dtype=object),
 array(['The Paiza Club'], dtype=object)]

## 3. Use non-negative matrix factorization (NMF)

In [97]:
from sklearn.decomposition import NMF

# Factorize the utility matrix into W (one row per user, 10 latent components)
# and H (10 latent components, one column per restaurant).
# random_state is fixed so that re-running the notebook gives the same factors.
nmf = NMF(n_components=10, random_state=0)

# fit_transform learns the model and returns W in one pass, instead of fitting
# and then solving for W a second time with transform()
W = nmf.fit_transform(df_utility)
H = nmf.components_

In [125]:
# Reconstruction error reported by the fitted NMF model
nmf.reconstruction_err_

1322.5368422289296

In [124]:
# Make interpretable: round both factors to 2 decimals and attach the user ids
# (rows of W) and business ids (columns of H) as labels
W = pd.DataFrame(np.around(W, 2), index=df_utility.index)
H = pd.DataFrame(np.around(H, 2), columns=df_utility.columns)
W.shape, H.shape

((5160, 10), (10, 4394))

In [119]:
# Verify reconstruction
np.around(W.dot(H),2).head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-1m9o3vGRA8IBPNvNqKLmA,-1vfRrlnNnNJ5boOVghMPA,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-9YyInW1wapzdNZrhQJ9dg,-AD5PiuJHgdUcAK-Vxao2A,-ADtl9bLp8wNqYX1k3KuxA,-BS4aZAQm9u41YnB9MUASA,-Bf8BQ3yMk8U2f45r2DRKw,...,zmltWmTpoBt5sCU-5Kzj-Q,znWHLW1pt19HzW1VY6KfCA,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zrQ1zKWC-W2PCvwjBururQ,zsQk990PubOHjr1YcLkQFw,zt9RLUIU32fZYOBh2L0NNQ,zttcrQP4MxNS5X5itzStXg,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--ZNfWKj1VyVElRx6-g1fg,0.06,0.02,0.0,0.06,0.01,0.0,0.02,0.0,0.0,0.02,...,0.04,0.0,0.0,0.32,0.0,0.05,0.01,0.0,0.0,0.07
-05XqtNjcBq19vh2CVJN8g,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.02,...,0.02,0.0,0.01,0.04,0.0,0.01,0.0,0.0,0.02,0.02
-2gOxVWcnBr5DclrrsWXCA,0.02,0.0,0.01,0.0,0.04,0.0,0.05,0.09,0.01,0.07,...,0.03,0.05,0.01,0.17,0.0,0.02,0.01,0.01,0.01,0.12
-4JDJeFS0YAYSiSvIshGLQ,0.01,0.0,0.0,0.01,0.01,0.0,0.01,0.02,0.05,0.02,...,0.0,0.01,0.04,0.08,0.0,0.01,0.0,0.0,0.04,0.04
-50XWnmQGqBgEI-9ANvLlg,0.11,0.02,0.01,0.09,0.07,0.01,0.1,0.13,0.06,0.12,...,0.1,0.06,0.06,0.84,0.0,0.06,0.03,0.01,0.06,0.24


In [117]:
# Original ratings for the same users, for comparison with the reconstruction
df_utility.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-1m9o3vGRA8IBPNvNqKLmA,-1vfRrlnNnNJ5boOVghMPA,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-9YyInW1wapzdNZrhQJ9dg,-AD5PiuJHgdUcAK-Vxao2A,-ADtl9bLp8wNqYX1k3KuxA,-BS4aZAQm9u41YnB9MUASA,-Bf8BQ3yMk8U2f45r2DRKw,...,zmltWmTpoBt5sCU-5Kzj-Q,znWHLW1pt19HzW1VY6KfCA,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zrQ1zKWC-W2PCvwjBururQ,zsQk990PubOHjr1YcLkQFw,zt9RLUIU32fZYOBh2L0NNQ,zttcrQP4MxNS5X5itzStXg,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--ZNfWKj1VyVElRx6-g1fg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
-05XqtNjcBq19vh2CVJN8g,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-2gOxVWcnBr5DclrrsWXCA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-4JDJeFS0YAYSiSvIshGLQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-50XWnmQGqBgEI-9ANvLlg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
# Top 10 restaurants for latent component 0 (first row of H),
# ordered by descending weight
component_0_weights = H.iloc[0]
top_10 = component_0_weights.sort_values(ascending=False).head(10).index
top_10

Index(['zRqi6L1u-YmmVAHjeUbGMQ', 'uW6UHfONAmm8QttPkbMewQ',
       'q3oJ6bNRV3OoJrwc95GOwg', 'hIUKufhwR6Ifn7bi0-phLA',
       '4mb32UmQULqg7IMck28vog', 'V-0qRzBHKixmQgon_fW_AA',
       'SVGApDPNdpFlEjwRQThCxA', 'BhueLLvA0k9G1Lr0WeZX9w',
       'mDR12Hafvr84ctpsV6YLag', 'NIOwzgujIXKVBEVNTQBXpg'],
      dtype='object', name='business_id')

In [123]:
# Look up the restaurant name(s) recorded for each of the top-10 business ids
[df_filter.loc[df_filter['business_id'] == business_id, 'name'].unique()
 for business_id in top_10]

[array(['SkinnyFATS'], dtype=object),
 array(['Island Flavor'], dtype=object),
 array(['Dirt Dog'], dtype=object),
 array(['Baguette Cafe'], dtype=object),
 array(['Pinches Tacos'], dtype=object),
 array(['Makai Pacific Island Grill'], dtype=object),
 array(["Juan's Flaming Fajitas & Cantina"], dtype=object),
 array(['Cafe Zupas'], dtype=object),
 array(['Mr Mamas'], dtype=object),
 array(['Jjanga Steak & Sushi'], dtype=object)]