## Simple Correlation based recommender system

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Load the per-user ratings and the place -> cuisine lookup from their CSV files.
ratings = pd.read_csv('/content/rating_final.csv')
cuisine = pd.read_csv('/content/chefmozcuisine.csv')


In [0]:
# Place metadata (name, address, ...). The file is decoded as latin-1 --
# presumably because it is not valid UTF-8 (names contain accented characters).
places = pd.read_csv(
    '/content/geoplaces2.csv',
    encoding='latin-1',
)

In [49]:
# Preview the first rows of the places table to see which columns are available.
places.head()

Unnamed: 0,placeID,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,zip,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services
0,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,?,No_Alcohol_Served,none,informal,no_accessibility,medium,kikucuernavaca.com.mx,familiar,f,closed,none
1,132825,22.147392,-100.983092,0101000020957F00001AD016568C4858C1243261274BA5...,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,78280,No_Alcohol_Served,none,informal,completely,low,?,familiar,f,open,none
2,135106,22.149709,-100.976093,0101000020957F0000649D6F21634858C119AE9BF528A3...,El Rincón de San Francisco,Universidad 169,San Luis Potosi,San Luis Potosi,Mexico,?,78000,Wine-Beer,only at bar,informal,partially,medium,?,familiar,f,open,none
3,132667,23.752697,-99.163359,0101000020957F00005D67BCDDED8157C1222A2DC8D84D...,little pizza Emilio Portes Gil,calle emilio portes gil,victoria,tamaulipas,?,?,?,No_Alcohol_Served,none,informal,completely,low,?,familiar,t,closed,none
4,132613,23.752903,-99.165076,0101000020957F00008EBA2D06DC8157C194E03B7B504E...,carnitas_mata,lic. Emilio portes gil,victoria,Tamaulipas,Mexico,?,?,No_Alcohol_Served,permitted,informal,completely,medium,?,familiar,t,closed,none


In [0]:
# We are trying to recommend restaurants based on user ratings, so the only useful columns from this dataframe are 'placeID' (the key shared with the ratings table) and 'name'

In [51]:
# Preview the place -> cuisine lookup table.
cuisine.head()

Unnamed: 0,placeID,Rcuisine
0,135110,Spanish
1,135109,Italian
2,135107,Latin_American
3,135106,Mexican
4,135105,Fast_Food


In [52]:
# Preview the ratings table: one row per (userID, placeID) with overall, food and service ratings.
ratings.head()

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2


In [0]:
# Keep only the place identifier and its display name; nothing else from
# the places table is needed by the recommender.
place_name = places.loc[:, ['placeID', 'name']]

In [60]:
# Check that the trimmed table holds just placeID and name.
place_name.head()

Unnamed: 0,placeID,name
0,134999,Kiku Cuernavaca
1,132825,puesto de tacos
2,135106,El Rincón de San Francisco
3,132667,little pizza Emilio Portes Gil
4,132613,carnitas_mata


In [68]:
# Average overall rating per place, kept as a one-column DataFrame indexed by placeID.

place_ratings = ratings.groupby('placeID')['rating'].mean().to_frame()
place_ratings.head()

Unnamed: 0_level_0,rating
placeID,Unnamed: 1_level_1
132560,0.5
132561,0.75
132564,1.25
132572,1.0
132583,1.0


In [69]:
# An average alone is not enough: also record how many ratings each place received.
# The grouped counts are indexed by placeID, so they align with place_ratings on assignment.

place_ratings['rating count'] = ratings.groupby('placeID')['rating'].count()

place_ratings.head()


Unnamed: 0_level_0,rating,rating count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132560,0.5,4
132561,0.75,4
132564,1.25,4
132572,1.0,15
132583,1.0,4


In [70]:
# Summary statistics for the per-place average rating and rating count.
place_ratings.describe()

Unnamed: 0,rating,rating count
count,130.0,130.0
mean,1.179622,8.930769
std,0.349354,6.124279
min,0.25,3.0
25%,1.0,5.0
50%,1.181818,7.0
75%,1.4,11.0
max,2.0,36.0


In [0]:
#We can see that every place in our dataframe has at least 3 ratings, and the most popular place has 36 ratings

In [0]:
#We can also see that the maximum average rating is 2, which suggests that each place is rated on a scale of 0 to 2

In [74]:
# Rank places by popularity (number of ratings received), most-rated first.

place_ratings.sort_values(by=['rating count'], ascending=[False]).head(5)

Unnamed: 0_level_0,rating,rating count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135085,1.333333,36
132825,1.28125,32
135032,1.178571,28
135052,1.28,25
132834,1.0,25


In [75]:
# Look up the name of the most-rated place (placeID 135085).

place_name.loc[place_name['placeID'].eq(135085)]

Unnamed: 0,placeID,name
121,135085,Tortas Locas Hipocampo


In [0]:
#Tortas Locas Hipocampo is the most popular place on our dataset

In [77]:
# Look up which cuisine the most-rated place (placeID 135085) serves.

cuisine.loc[cuisine['placeID'].eq(135085)]

Unnamed: 0,placeID,Rcuisine
44,135085,Fast_Food


In [0]:
#Tortas Locas Hipocampo serves Fast food

In [81]:
# Build a user x place matrix of overall ratings: one row per user, one column per place.
# Places a user has not rated show up as NaN. This matrix is what lets us
# compare places by how the same users rated them.

place_crosstab = ratings.pivot_table(
    values='rating',
    index='userID',
    columns='placeID',
)

place_crosstab.head()

placeID,132560,132561,132564,132572,132583,132584,132594,132608,132609,132613,132626,132630,132654,132660,132663,132665,132667,132668,132706,132715,132717,132723,132732,132733,132740,132754,132755,132766,132767,132768,132773,132825,132830,132834,132845,132846,132847,132851,132854,132856,...,135044,135045,135046,135047,135048,135049,135050,135051,135052,135053,135054,135055,135057,135058,135059,135060,135062,135063,135064,135065,135066,135069,135070,135071,135072,135073,135074,135075,135076,135079,135080,135081,135082,135085,135086,135088,135104,135106,135108,135109
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
U1001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,1.0,,,,,,,,...,,1.0,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,
U1002,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,...,,,,,,,,,1.0,,,,,,1.0,,1.0,,,,,,,,,,,,,,,,,1.0,,,,1.0,,
U1003,,,,,,,,,,,,,,,,,,,,,,2.0,,,,2.0,2.0,,,,,2.0,,,,,,,,,...,,,,,,,,,,,,,,,2.0,,,,0.0,,,,,,,,,2.0,,2.0,2.0,,,,,,,,,
U1004,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,1.0,2.0,,,,,,,,,,,,,,,,,,,,,2.0,,
U1005,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,...,,,,,,,1.0,,,,,,1.0,,,,,,,,2.0,,,,,,,,2.0,,,,,,,,,,,


In [0]:
#We see plenty of null values because customers do not review every restaurant they visit; most customers review only a few of the restaurants

In [83]:
# Pull out the ratings column for Tortas Locas Hipocampo (placeID 135085), the most popular place,
# and show only the users who actually rated it (NaN fails the >= 0 comparison and is filtered out).

tlh_ratings = place_crosstab.loc[:, 135085]
tlh_ratings.loc[tlh_ratings.ge(0)]

userID
U1001    0.0
U1002    1.0
U1007    1.0
U1013    1.0
U1016    2.0
U1027    1.0
U1029    1.0
U1032    1.0
U1033    2.0
U1036    2.0
U1045    2.0
U1046    1.0
U1049    0.0
U1056    2.0
U1059    2.0
U1062    0.0
U1077    2.0
U1081    1.0
U1084    2.0
U1086    2.0
U1089    1.0
U1090    2.0
U1092    0.0
U1098    1.0
U1104    2.0
U1106    2.0
U1108    1.0
U1109    2.0
U1113    1.0
U1116    2.0
U1120    0.0
U1122    2.0
U1132    2.0
U1134    2.0
U1135    0.0
U1137    2.0
Name: 135085, dtype: float64

In [0]:
#These are the users who rated Tortas Locas Hipocampo, along with the rating each of them gave

In [88]:
# Correlate every place's rating column with the Tortas Locas Hipocampo ratings.
# Places for which no correlation can be computed come back as NaN and are dropped.

similar_tortas = place_crosstab.corrwith(tlh_ratings)

tortas_corr = similar_tortas.to_frame(name='PearsonR').dropna()
tortas_corr.sort_values(by='PearsonR', ascending=False)

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,PearsonR
placeID,Unnamed: 1_level_1
135076,1.0
132922,1.0
135085,1.0
132937,1.0
132925,1.0
135066,1.0
135053,1.0
132754,0.930261
135045,0.912871
135062,0.898933


In [98]:
# The correlation alone does not say how many ratings stand behind it, so attach each
# place's total rating count and keep only places with more than 10 ratings.

tortas_corr_final = tortas_corr.join(place_ratings['rating count'])
(
    tortas_corr_final
    .loc[tortas_corr_final['rating count'].gt(10)]
    .sort_values(by='PearsonR', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,PearsonR,rating count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135085,1.0,36
135076,1.0,13
135066,1.0,12
132754,0.930261,13
135045,0.912871,13
135062,0.898933,21
135028,0.892218,15
135042,0.881409,20
135046,0.867722,11
132872,0.840168,12


In [0]:
#Some places have a correlation of exactly 1.0, but these values are not useful:
#a perfect score usually means the place shares only a couple of common raters with Tortas Locas Hipocampo,
#and we cannot make reliable recommendations from so few overlapping ratings

In [101]:
# Candidate places taken from the filtered correlation table: Tortas Locas Hipocampo itself
# plus the top entries, leaving out 135076 and 135066 whose PearsonR is exactly 1.0.
candidate_ids = [135085, 132754, 135045, 135062, 135028, 135042, 135046, 132872]
places_corr_tortas = pd.DataFrame(
    {'placeID': candidate_ids},
    index=np.arange(8),
)

# merge() is an inner join by default, so candidates without a row in `cuisine`
# do not appear in the summary.
summary = places_corr_tortas.merge(cuisine, on='placeID')
summary

Unnamed: 0,placeID,Rcuisine
0,135085,Fast_Food
1,132754,Mexican
2,135028,Mexican
3,135042,Chinese
4,135046,Fast_Food
5,132872,American


In [104]:
# Another fast-food restaurant (placeID 135046) turned out to be highly correlated.
# Before drawing conclusions, check how many distinct cuisines exist in the data.

cuisine.loc[:, 'Rcuisine'].describe()

count         916
unique         59
top       Mexican
freq          239
Name: Rcuisine, dtype: object

In [0]:
#Now we can see that there are 59 unique cuisines in our data, with Mexican being the most common
#and yet another fast food restaurant still came out highly correlated

In [106]:
# Look up the name of the correlated fast-food place (placeID 135046) to recommend.

place_name.loc[place_name['placeID'].eq(135046)]

Unnamed: 0,placeID,name
42,135046,Restaurante El Reyecito


In [0]:
#Therefore we can safely recommend 'Restaurante El Reyecito' to users who liked 'Tortas Locas Hipocampo'