In [50]:
import sqlite3
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

# Load data

In [24]:
conn = sqlite3.connect('yelp_dataset_reviews.db') 
df_reviews= pd.read_sql("""SELECT * FROM reviews""",conn )
conn.close()

In [14]:
pd.set_option('display.max_colwidth', None)

In [2]:
conn = sqlite3.connect('yelp_dataset_business.db') 
df_business= pd.read_sql("""SELECT * FROM business""",conn )
conn.close()

In [3]:
df_business.is_open = df_business.is_open.astype(int)
df_business["review_count"]= df_business["review_count"].astype(int)
df_business['attributes']= df_business['attributes'].apply(lambda x: eval(x) if x else None)
df_business['hours']= df_business['hours'].apply(lambda x: eval(x) if x else None)

In [19]:
conn = sqlite3.connect('yelp_dataset_users.db') 
df_users= pd.read_sql("""SELECT user_id, name FROM users""",conn )
conn.close()

In [4]:
attr = [col for col in df_business.attributes.explode().unique() if col is not None]
lst_of_attr_dict = []
for attr_dict in df_business.attributes:
    if not attr_dict:
        lst_of_attr_dict.append({})
        continue

    if 'BusinessParking' in attr_dict:
        if type(attr_dict['BusinessParking']) == str:
            attr_dict['BusinessParking'] = ('True' in attr_dict['BusinessParking'])

    lst_of_attr_dict.append(attr_dict)
attr_df = pd.DataFrame(lst_of_attr_dict, columns=attr)

for col in attr_df:
        attr_df[col] = attr_df[col].fillna(False).astype(bool)
    
df_business = pd.concat([df_business.reset_index().drop('index', axis=1), attr_df], axis=1)
df_business.drop(['attributes'], axis=1, inplace=True)
lst_of_time = []
for time_dict in df_business.hours:
    if not time_dict:
        lst_of_time.append({})
        continue
    lst_of_time.append(time_dict)
time_df = pd.DataFrame(lst_of_time)
df_business = pd.concat([df_business, time_df], axis=1).drop('hours', axis=1)

In [36]:
df_reviews['date'] = pd.to_datetime(df_reviews['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reviews['date'] = pd.to_datetime(df_reviews['date'])


# Feature Engineering

In [6]:
df_business.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'categories', 'ByAppointmentOnly', 'BusinessAcceptsCreditCards',
       'BikeParking', 'RestaurantsPriceRange2', 'CoatCheck',
       'RestaurantsTakeOut', 'RestaurantsDelivery', 'Caters', 'WiFi',
       'BusinessParking', 'WheelchairAccessible', 'HappyHour',
       'OutdoorSeating', 'HasTV', 'RestaurantsReservations', 'DogsAllowed',
       'Alcohol', 'GoodForKids', 'RestaurantsAttire', 'Ambience',
       'RestaurantsTableService', 'RestaurantsGoodForGroups', 'DriveThru',
       'NoiseLevel', 'GoodForMeal', 'BusinessAcceptsBitcoin', 'Smoking',
       'Music', 'GoodForDancing', 'AcceptsInsurance', 'BestNights', 'BYOB',
       'Corkage', 'BYOBCorkage', 'HairSpecializesIn', 'Open24Hours',
       'RestaurantsCounterService', 'AgesAllowed', 'DietaryRestrictions',
       'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'S

In [7]:
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,RestaurantsCounterService,AgesAllowed,DietaryRestrictions,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,...,False,False,False,,,,,,,
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,...,False,False,False,0:0-0:0,8:0-18:30,8:0-18:30,8:0-18:30,8:0-18:30,8:0-14:0,
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,...,False,False,False,8:0-22:0,8:0-22:0,8:0-22:0,8:0-22:0,8:0-23:0,8:0-23:0,8:0-22:0
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,False,False,False,7:0-20:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-21:0,7:0-21:0,7:0-21:0
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,...,False,False,False,,,14:0-22:0,16:0-22:0,12:0-22:0,12:0-22:0,12:0-18:0


In [13]:
df_business['Monday'].value_counts()

0:0-0:0        31362
8:0-17:0        4202
9:0-17:0        3910
11:0-22:0       3241
11:0-21:0       2938
               ...  
19:30-22:0         1
16:45-20:15        1
22:0-18:0          1
21:30-0:0          1
20:0-4:0           1
Name: Monday, Length: 1315, dtype: int64

In [11]:
df_business = df_business.replace(False, 0)
df_business = df_business.replace(True, 1)

In [16]:
df_business = df_business.drop(["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],axis=1)

In [17]:
df_business

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,AcceptsInsurance,BestNights,BYOB,Corkage,BYOBCorkage,HairSpecializesIn,Open24Hours,RestaurantsCounterService,AgesAllowed,DietaryRestrictions
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,...,0,0,0,0,0,0,0,0,0,0
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,...,0,0,0,0,0,0,0,0,0,0
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,...,0,0,0,0,0,0,0,0,0,0
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,0,0,0,0,0,0,0,0,0,0
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,...,0,0,0,0,0,0,0,0,0,0
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,...,0,0,0,0,0,0,0,0,0,0
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,...,0,0,0,0,0,0,0,0,0,0
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,...,0,0,0,0,0,0,0,0,0,0


## user based Collaborative Filtering


In [37]:
df_users
selected= ['review_id', 'user_id', 'business_id', 'stars','date']
df_reviews = df_reviews[selected]

In [38]:
df_reviews = df_reviews[df_reviews.stars>4]

In [39]:
dis_rate = df_reviews.iloc[:10000,:]

In [40]:
idx = dis_rate.groupby(['user_id','business_id'])['date'].idxmax()

In [41]:
dis_rate = dis_rate.loc[idx]

In [42]:
dis_rate

Unnamed: 0,review_id,user_id,business_id,stars,date
22278,xJuVVh0wspQlCPgTcbbiIg,---2PmXbF47D870stH1jqA,hKameFsaXh9g8WQbv593UA,5.0,2014-10-28 14:38:58
5247,6mKyy0a9PnMHALlPGrpLIw,--_r6E98SNIrGU7weyNxbw,EC2huvu74EMjrpWdEizbmw,5.0,2018-01-21 17:23:23
6978,QeneleeT82C_pB7SlaYJtQ,--pvE2eu3WWwikKs1E2QDw,EP2jFD3aGoSBCWb7irY5-w,5.0,2015-05-13 00:31:23
11108,FCwcqqqSwDhvc3uZB4rL2Q,-0-ufVVBqB64vpLtoFQZrA,kZ3L75t_7EqE9kRS6bpWSA,5.0,2011-02-14 05:43:07
20145,Tirdt42lWUNQDOCY_LAwiw,-0H6Rm6dCi3pkFBC26HsoQ,HoXdBq2Puaj69d8z6lgiVw,5.0,2015-11-07 03:34:11
...,...,...,...,...,...
21958,gPSH-OapRTn10rotY5MyeA,zxuxd6Hz2tKcpgZ71dYEcw,N-ej51lLtIl4TMC-Qb3k4A,5.0,2014-04-02 03:05:37
20029,Q1GnUTgTEVdASFuI2BDz3w,zy8vpv-D0tSwtA4wXa_t7A,Bd9AyP3ZpE57Ed0gHwh6NA,5.0,2016-08-13 13:42:10
7777,Y2sHXw8zLGb6rvOeRWHiVA,zyS9t2Ceoru_Rt5AK2DlcA,-zsvmEbkd-K9K2DAAKqiEQ,5.0,2016-06-11 21:08:10
17791,guMOuznG90cVq0HqL2MapA,zyvxtbh5eJ86bVgk52Yflg,p7UGlPkE3Gagm9Mq7aHfYg,5.0,2013-10-25 15:51:28


In [44]:
piv_rate= dis_rate.pivot(index = 'user_id', columns ='business_id', values = 'stars').fillna(0)
piv_rate

business_id,--ZVrH2X2QXBFdCilbirsw,-02xFuruu85XmDn2xiynJw,-0Ym1Wg3bXd_TDz8JtvOQg,-1MhPXk1FglglUAmuPLIGg,-1ueCbvIpUPi8KT95ETTKw,-3AooxIkg38UyUdlz5oXdw,-5jrwZnndGs9q3akdkJYJA,-6OjnX3ZdDOhHxWR60wysg,-7GDqSUaXrpC8Ql7nDBxWA,-7GjicSH_rM8JeZGCXGcUg,...,znTKlh4x8NoBIojm4Yo5hA,znsHfZ2seiCAiO8NGQm8-Q,zrd6bulYNAKhCdSVtJ8aXQ,zsDjnBHyJsYfkdry96x75g,zu4p6IZLSVn2Noto-vcwzw,zun6IVJa7wYe3wAPqWnPGw,zwTmOj4B_OVPMTMYijQiKg,zwXT4m9svdg-xh2cKjEp-Q,zxuVnNVAUm16Sro0dS_lmg,zyKTy79BwgCLdUGBiuYPxA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---2PmXbF47D870stH1jqA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--_r6E98SNIrGU7weyNxbw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--pvE2eu3WWwikKs1E2QDw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0-ufVVBqB64vpLtoFQZrA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0H6Rm6dCi3pkFBC26HsoQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zxuxd6Hz2tKcpgZ71dYEcw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zy8vpv-D0tSwtA4wXa_t7A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyS9t2Ceoru_Rt5AK2DlcA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyvxtbh5eJ86bVgk52Yflg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
bus_id = piv_rate.columns
Names = [df_business[df_business.business_id == id_]['name'].iloc[0] for id_ in bus_id] # extract movie titles
piv_rate.columns = Names
counts = piv_rate.apply(np.count_nonzero, axis=0) # count the number of ratings for each movie
number_ratings_df = pd.DataFrame(counts).reset_index()
number_ratings_df.columns = ['bus_name', 'numberRatings']
# get the names of the 500 movies with the most ratings in the df
top = number_ratings_df.sort_values('numberRatings', ascending=False)[0:500].bus_name

In [47]:
ratings = piv_rate[list(top)]
ratings

Unnamed: 0_level_0,Luke,District Donuts Sliders Brew,Peppermill Reno,Surrey's Café & Juice Bar,Mr. B's Bistro,The Pancake Pantry,Prep & Pastry,Blues City Deli,HipCityVeg,Bacchanal Fine Wine & Spirits,...,Indiana City Brewing,M.L.Rose Craft Beer & Burgers - Melrose,Philadelphia Phillies,Desert Dream Ice Cream,Ole Red,Total Wine & More,Ballyhoo Grill,Fireside Lounge,Zea Rotisserie and Grill,Cafe Square One
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---2PmXbF47D870stH1jqA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--_r6E98SNIrGU7weyNxbw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--pvE2eu3WWwikKs1E2QDw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0-ufVVBqB64vpLtoFQZrA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0H6Rm6dCi3pkFBC26HsoQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zxuxd6Hz2tKcpgZ71dYEcw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zy8vpv-D0tSwtA4wXa_t7A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyS9t2Ceoru_Rt5AK2DlcA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyvxtbh5eJ86bVgk52Yflg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
svd_model = TruncatedSVD(n_components=50, random_state=671)
users_df_svd = svd_model.fit_transform(ratings)

U = users_df_svd / svd_model.singular_values_ 
sigma = svd_model.singular_values_
V = svd_model.components_

# checks 
print("Shape of U:", U.shape)
print("Length of Sigma:", len(sigma))
print("Shape of V:", V.shape)

Shape of U: (9646, 50)
Length of Sigma: 50
Shape of V: (50, 530)


In [51]:

clusters = KMeans(n_clusters=5, random_state=6).fit_predict(U) # arbitrary choice of k
clustered_users_df = pd.DataFrame(U, index = piv_rate.index)
clustered_users_df['cluster'] = clusters # create new column with cluster membership
print(clustered_users_df.shape) # check
clustered_users_df.head()

(9646, 51)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,cluster
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---2PmXbF47D870stH1jqA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
--_r6E98SNIrGU7weyNxbw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
--pvE2eu3WWwikKs1E2QDw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
-0-ufVVBqB64vpLtoFQZrA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
-0H6Rm6dCi3pkFBC26HsoQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [53]:
user = input("Enter a user id: ")

cluster_no = int(clustered_users_df[clustered_users_df.index == user].cluster) # get the user's cluster
print("User", user, "belongs to cluster", cluster_no)
similar_users = list(clustered_users_df[clustered_users_df.cluster == cluster_no].index) # find other users in the cluster
print("Number of similar users:", len(similar_users))
print("Similar users:", similar_users)

User ---2PmXbF47D870stH1jqA belongs to cluster 0
Number of similar users: 9536
Similar users: ['---2PmXbF47D870stH1jqA', '--_r6E98SNIrGU7weyNxbw', '--pvE2eu3WWwikKs1E2QDw', '-0-ufVVBqB64vpLtoFQZrA', '-0H6Rm6dCi3pkFBC26HsoQ', '-0_v1Gvjhsi0AbKmQLdq0g', '-0ouXH70JU4xkVw-A4Gs8w', '-1C-Lu0NTlYi4XB82ntKtg', '-1WbN1Qd-opw8u3uEqs2Kg', '-3QN6QV27xaeLz4qrWFOng', '-3v5r9Z0BllS55zvWgz3Fw', '-4Ot7PKITEGPven8mjGIog', '-4qgRjqx6fdCAeewTBEWbw', '-4sJjyuiDgTgUoW0RlhQtg', '-6Fk7QmNWJDaDKwwcgS8PQ', '-6GQDYZxSKX23WmXBpiJ5w', '-6MKzFSO3n9egKtXiCSngQ', '-6rFcyKGC-B7C89FzLyBeA', '-73QalI4iMTK0LvOhHEsVg', '-7ocx21qec8R9e2t8yO27A', '-AsIFz8wFYnV1C8TuiV7BQ', '-AymVok4pBNTZuSo19nNcA', '-BUc-4dOQD_fJoI411WfRA', '-BnZ6LZXIDrluYRM1hyTKA', '-BvuV_I0aWBeVcqTKbml3w', '-CgC_RfYdWEPUjUs1BlTTg', '-DMJL_fcLZCvlqzMm97giA', '-DSTIwizyfygwcZG4KmSxA', '-DY5Adv-j57Ve2HlERmoRA', '-Dt5o6GpQcXQfVeWpHNtDg', '-E6AHdTbPAFpJ63lkvuZrA', '-EoJZdxSKhowrsFMBAPE4Q', '-F6PdlUlQMGrsX-qug_f4g', '-FFgJuDZLmcZtrxDTRC7sg', '-FH9K-xkwdE25gH8lGGQ

In [54]:
cluster_df = ratings.loc[similar_users] # filter to users in cluster
cluster_df[cluster_df == 0] = np.nan # set 0 values to NaN (to not bias calculation of means)
avg_ratings = cluster_df.mean() # get the mean rating for each movie for the cluster
avg_ratings.sample(5) # preview a sample of the mean cluster ratings

Angry Chair Brewing                    5.0
Heads & Tails Beauty Boutique          5.0
Ghini's French Caffe                   5.0
Hyatt Regency St. Louis at the Arch    5.0
Watermark Restaurant                   5.0
dtype: float64