# Restaurant recommendation engine


### Aquiring, cleaning and properly formatting the data

In [3]:
# We get the data of the different boroughs of santo domingo directly from wikipedia
import pandas as pd

tables = pd.read_html(r"https://en.wikipedia.org/wiki/Distrito_Nacional")

boroughs = tables[1]
boroughs.drop(0, 0, inplace = True)
boroughs.columns = ['sector', 'population']
boroughs = boroughs.reset_index(drop=True)
boroughs.head()

Unnamed: 0,sector,population
0,Altos de Arroyo Hondo,27692
1,Arroyo Manzano,19151
2,Atala,17617
3,Bella Vista,28253
4,Buenos Aires-Independencia,14759


In [6]:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="restaurant recommendation engine")
lat = []
lon = []

for index, row in boroughs.iterrows():
    location = geolocator.geocode(row['sector']+', Santo Domingo, Dominican Republic')
    # print(location)
    lat.append(getattr(location, 'latitude', None))
    lon.append(getattr(location, 'longitude', None))

boroughs['lat'] = lat
boroughs['lon'] = lon
boroughs.head()

Unnamed: 0,sector,population,lat,lon
0,Altos de Arroyo Hondo,27692,18.495855,-69.974084
1,Arroyo Manzano,19151,18.516797,-69.971034
2,Atala,17617,,
3,Bella Vista,28253,18.452205,-69.948472
4,Buenos Aires-Independencia,14759,18.715407,-69.967375


In [7]:
import folium

location_unknown = []

def printBoroughsMap(df):
    folium_map= folium.Map(location=[df.loc[46]['lat'], df.loc[46]['lon']],
                            zoom_start=12,
                            tiles="CartoDB dark_matter")

    for index, row in df.iterrows():
        try:
            popup_text = """{}<br>
                    Population: {}<br>
                    id: {}<br>"""
            popup_text = popup_text.format(row["sector"], row["population"], index)
            marker = folium.CircleMarker(location=[df.loc[index]['lat'], df.loc[index]['lon']],
                                        popup=popup_text)
            marker.add_to(folium_map)

        except ValueError:
            location_unknown.append(row)
        
    return folium_map

printBoroughsMap(boroughs)

In [8]:
# based on map inspection, and non existance in our dataset
# We need to find out by other means the lat,lon of the following locations

location_unknown.extend([boroughs.loc[4], boroughs.loc[61], boroughs.loc[51] ])
to_find = pd.DataFrame(location_unknown)
to_find.sort_index(inplace= True)

boroughs.loc[boroughs.sector == 'Buenos Aires-Independencia', ['lat','lon']] = [None, None]
boroughs.loc[boroughs.sector == 'Paraíso', ['lat','lon']] = [None, None]
boroughs.loc[boroughs.sector == 'Simón Bolívar', ['lat','lon']] = [None, None]

to_find

Unnamed: 0,sector,population,lat,lon
2,Atala,17617,,
4,Buenos Aires-Independencia,14759,18.715407,-69.967375
7,Centro Olímpico Duarte,36,,
13,Domingo Sabio,7359,,
15,Ensanche Capotillo,27613,,
24,Honduras del Oeste,9679,,
26,Jardín Zoológico,177,,
48,Nuestra Señora de la Paz,8729,,
51,Paraíso,26021,18.534375,-69.838334
52,Paseo de los Indios,6697,,


In [9]:
# Atala location based on goggle maps
to_find.loc[to_find.sector == 'Atala', ['lat','lon']] = [18.4451254, -69.944063]

to_find.loc[to_find.sector == 'Centro Olímpico Duarte', ['lat','lon']] = [18.4775096,-69.9200001]
to_find.loc[to_find.sector == 'Buenos Aires-Independencia', ['lat','lon']] = [18.4311477,-69.9724999]
to_find.loc[to_find.sector == 'Domingo Sabio', ['lat','lon']] = [18.4971503,-69.8882764]
to_find.loc[to_find.sector == 'Ensanche Capotillo', ['lat','lon']] = [18.5062898,-69.9070895]
to_find.loc[to_find.sector == 'Honduras del Oeste', ['lat','lon']] = [18.4275731,-69.9854261]
to_find.loc[to_find.sector == 'Nuestra Señora de la Paz', ['lat','lon']] = [18.4519672,-69.9332128]
to_find.loc[to_find.sector == 'Paraíso', ['lat','lon']] = [18.4798277,-69.9439645]
to_find.loc[to_find.sector == 'Paseo de los Indios', ['lat','lon']] = [18.4610866,-69.9632593]
to_find.loc[to_find.sector == 'Simón Bolívar', ['lat','lon']] = [18.5098635,-69.8972511]
to_find.loc[to_find.sector == 'Treinta de Mayo', ['lat','lon']] = [18.4409933,-69.9396408]
to_find.loc[to_find.sector == 'Tropical Metaldom', ['lat','lon']] = [18.4376854,-69.9509221]
to_find.loc[to_find.sector == 'Veinticuatro de Abril', ['lat','lon']] = [18.5064577,-69.8956526]


to_find

Unnamed: 0,sector,population,lat,lon
2,Atala,17617,18.445125,-69.944063
4,Buenos Aires-Independencia,14759,18.431148,-69.9725
7,Centro Olímpico Duarte,36,18.47751,-69.92
13,Domingo Sabio,7359,18.49715,-69.888276
15,Ensanche Capotillo,27613,18.50629,-69.907089
24,Honduras del Oeste,9679,18.427573,-69.985426
26,Jardín Zoológico,177,,
48,Nuestra Señora de la Paz,8729,18.451967,-69.933213
51,Paraíso,26021,18.479828,-69.943965
52,Paseo de los Indios,6697,18.461087,-69.963259


In [10]:
clean_borough = boroughs.combine_first(to_find)
clean_borough.head()

Unnamed: 0,sector,population,lat,lon
0,Altos de Arroyo Hondo,27692,18.495855,-69.974084
1,Arroyo Manzano,19151,18.516797,-69.971034
2,Atala,17617,18.445125,-69.944063
3,Bella Vista,28253,18.452205,-69.948472
4,Buenos Aires-Independencia,14759,18.431148,-69.9725


In [11]:
clean_map = printBoroughsMap(clean_borough)
clean_map

In [12]:
import requests

CLIENT_ID = 'sadasdasd' # your Foursquare ID
CLIENT_SECRET = 'asdasdsa' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0FNUJY2SLQGHJ22TBKIQSQHY0KJOJ2GNZ1ZTDJC3PAU0LLFL
CLIENT_SECRET:134APLWJHIXZTSOFCMNUQCRK1LBZC055ED34FQBA5PPOUG0R


In [13]:


radius = 500
LIMIT = 100

venues_list = []

for index, row in clean_borough.iterrows():
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        clean_borough.loc[index]['lat'],
        clean_borough.loc[index]['lon'],
        radius, 
        LIMIT)
    # print(url)
    try:
        response = requests.get(url).json()["response"]
        results = response['groups'][0]['items']
        for venue in results:
            venues_list.append([
                venue['venue']['name'],
                venue['venue']['categories'][0]['name'],
                clean_borough.loc[index]['sector'],
                index,
                venue['venue']['location']['lat'], 
                venue['venue']['location']['lng']])
    except KeyError:
        print(response, index, clean_borough.loc[index]['sector'])

venues = pd.DataFrame(venues_list)
venues.columns = ['name', 'categories', 'sector', 'sector_index', 'lat', 'lon']
venues.head()
#places = pd.DataFrame(venues)
#places.head()


{} 26 Jardín Zoológico


Unnamed: 0,name,categories,sector,sector_index,lat,lon
0,Pradera Hermosa,Park,Altos de Arroyo Hondo,0,18.492552,-69.974307
1,Guacara Taina,Nightclub,Atala,2,18.445841,-69.945591
2,Av. Anacaona,Outdoors & Recreation,Atala,2,18.447205,-69.94602
3,Jet Set Club,Nightclub,Atala,2,18.442556,-69.942411
4,Cifré Clínica de Estética,Health & Beauty Service,Atala,2,18.442468,-69.941881


In [14]:
venues.size

7416

In [24]:
from random import randint

colors = []

for i in range(100):
    colors.append('%06X' % randint(0, 0xFFFFFF))

def insertVenues(df, folium_map):
    for index, row in df.iterrows():
        try:
            popup_text = """{}<br>
                    Category: {}<br>
                    id: {}<br>"""
            popup_text = popup_text.format(row["name"], row["categories"], index)
            marker = folium.CircleMarker(location=[df.loc[index]['lat'], df.loc[index]['lon']],
                                        radius='3',
                                        fill=True,
                                        fill_color='#'+colors[df.loc[index]['sector_index']],
                                        fill_opacity=0.7,
                                        color='#'+colors[df.loc[index]['sector_index']])
            marker.add_to(folium_map)
            
        except ValueError:
            print(row)
        
    return folium_map

mapa = printBoroughsMap(clean_borough)
insertVenues(venues, mapa)

mapa

In [17]:
# Top 50 more recurrent venues in the city

venues_by_cat = venues.groupby(['categories']).size().sort_values(ascending=False)
venues_by_cat[0:50]

categories
Bar                              45
BBQ Joint                        38
Pizza Place                      36
Ice Cream Shop                   35
Bank                             34
Restaurant                       31
Sandwich Place                   30
Pharmacy                         29
Bakery                           25
Fast Food Restaurant             24
Burger Joint                     24
Food Truck                       23
Supermarket                      23
Lounge                           20
Chinese Restaurant               20
Italian Restaurant               19
Nightclub                        19
Spanish Restaurant               18
Park                             18
Beer Garden                      17
Caribbean Restaurant             17
Café                             16
Gym                              15
Department Store                 14
Hotel                            14
Liquor Store                     13
Steakhouse                       13
Shopping Mall    

In [18]:
restaurant_alias = ['Restaurant', 'Food', 'Burguer', 'Sandwich', 'Coffee Shop', 'Bistro', 
                    'Pie', 'Dessert', 'Bakery', 'Donut', 'Diner', 'Buffet',
                    'Steakhouse', 'Gastropub', 'BBQ', 'Pizza', 'Ice Cream', 'Wings', 'Breakfast']

restaurant_cat = set()
all_cat= set()

restaurant_total = 0
total = 0

def isAMatch(string, arr):
    for ele in arr:
        if ele in string:
            return True
    return False

for category, size in venues_by_cat.items():
    # print(category, size)
    if isAMatch(category, restaurant_alias):
        restaurant_total += size
        restaurant_cat.add(category)
    total += size
    all_cat.add(category)

print('total', total)
print('Restaurant total', restaurant_total)
print('Food joints are a', str(round((restaurant_total/total)*100,2)) + '%',
      'of all venues in Santo Domingo avaiable in the foursquare api')

total 1236
Restaurant total 524
Food joints are a 42.39% of all venues in Santo Domingo avaiable in the foursquare api


In [19]:
print('All food joint categories', restaurant_cat)
print('Total ammount food joint categories:', len(restaurant_cat))

print('Total ammount of categories:', len(all_cat))

non_food_cat = all_cat.difference(restaurant_cat)


All food joint categories {'Asian Restaurant', 'Theme Restaurant', 'Falafel Restaurant', 'Caribbean Restaurant', 'Dessert Shop', 'BBQ Joint', 'Bed & Breakfast', 'French Restaurant', 'Arepa Restaurant', 'Breakfast Spot', 'Pier', 'Korean Restaurant', 'Diner', 'Japanese Restaurant', 'Tapas Restaurant', 'Middle Eastern Restaurant', 'Pie Shop', 'Italian Restaurant', 'Comfort Food Restaurant', 'Empanada Restaurant', 'Wings Joint', 'Ice Cream Shop', 'Restaurant', 'Gastropub', 'Peruvian Restaurant', 'Bakery', 'Brazilian Restaurant', 'Seafood Restaurant', 'Bistro', 'Indian Restaurant', 'Food', 'Mexican Restaurant', 'Chinese Restaurant', 'Mediterranean Restaurant', 'Argentinian Restaurant', 'Food & Drink Shop', 'Steakhouse', 'Paella Restaurant', 'Buffet', 'Dim Sum Restaurant', 'Coffee Shop', 'Food Truck', 'German Restaurant', 'Fast Food Restaurant', 'Sushi Restaurant', 'South American Restaurant', 'Latin American Restaurant', 'Sandwich Place', 'Pizza Place', 'American Restaurant', 'Donut Shop', 

In [20]:
venue_preferences_columns = ['sector', 'sector_id']
venue_preferences_columns.extend(list(all_cat))

types = {'sector':'str'}


venue_preferences = pd.DataFrame(columns=venue_preferences_columns)
venue_preferences

# for index, venue in venues.iterrows():

#venues.head()

#ser = venues.groupby(['sector_index', 'categories']).size()
#ser

group = venues.groupby(['sector_index', 'categories']).size()
#for sector_index, categories in group.items():
    #print(sector_index,categories)
    #print('wut??')
#= ['sector_index', 'categories', 'total']
#group = group.to_frame()
#group.columns.values
# print(types)

group = group.to_frame().unstack()
group.fillna(0, inplace=True)
group.head()

Unnamed: 0_level_0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
categories,Accessories Store,American Restaurant,Aquarium,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Veterinarian,Volleyball Court,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
sector_index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
normalized_df=(group-group.min())/(group.max()-group.min())

normalized_df

Unnamed: 0_level_0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
categories,Accessories Store,American Restaurant,Aquarium,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Veterinarian,Volleyball Court,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
sector_index,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.5,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [25]:
from sklearn.cluster import KMeans


In [73]:
kmeans = KMeans(n_clusters=7, random_state=0).fit(normalized_df)


In [74]:
kmeans.labels_[0:10]

array([0, 1, 1, 0, 0, 0, 1, 0, 5, 4])

In [75]:
def labeledMap(df, labels):
    folium_map= folium.Map(location=[df.loc[46]['lat'], df.loc[46]['lon']],
                            zoom_start=12,
                            tiles="CartoDB dark_matter")
    for index, row in df.iterrows():
        try:
            popup_text = """{}<br>
                    Category: {}<br>
                    id: {}<br>"""
            popup_text = popup_text.format(row["name"], row["categories"], index)
            marker = folium.CircleMarker(location=[df.loc[index]['lat'], df.loc[index]['lon']],
                                        radius='3',
                                        fill=True,
                                        fill_color='#'+colors[labels[df.loc[index]['sector_index']]],
                                        fill_opacity=0.7,
                                        color='#'+colors[labels[df.loc[index]['sector_index']]])
            marker.add_to(folium_map)
            
        except IndexError:
            print(row)
        
    return folium_map


In [76]:
finalMap = labeledMap(venues, kmeans.labels_)

name            Encajes La Rosario
categories               Gift Shop
sector             Villa Francisca
sector_index                    67
lat                        18.4824
lon                       -69.8898
Name: 1053, dtype: object
name               Barrio Chino
categories         Neighborhood
sector          Villa Francisca
sector_index                 67
lat                     18.4792
lon                    -69.8886
Name: 1054, dtype: object
name            Restaurante El Dragon
categories         Chinese Restaurant
sector                Villa Francisca
sector_index                       67
lat                           18.4789
lon                          -69.8885
Name: 1055, dtype: object
name            Restaurante Delicia Campestre
categories                 Chinese Restaurant
sector                        Villa Francisca
sector_index                               67
lat                                   18.4789
lon                                  -69.8875
Name: 1056, dtyp

In [77]:
finalMap

In [57]:
clean_borough.shape

(71, 4)

In [58]:
venues.shape

(1236, 6)