# Capstone project - location for new restaurants in Montreal

First let's import all required libraries and connect to foursquare.

In [1]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library


Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [2]:
# Connection to foursquare
CLIENT_ID = 'NMYVML2LZYNTFORMCNSE24I3EFOMFUB5RJOPY0SC5BSDTDNV' # your Foursquare ID
CLIENT_SECRET = 'NQD405VPACTG1UEFWFOVVLZ4FY0DU2YK5YYLKB2E2EQGKZPC' # your Foursquare Secret
VERSION = '20180604'
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


Your credentails:
CLIENT_ID: NMYVML2LZYNTFORMCNSE24I3EFOMFUB5RJOPY0SC5BSDTDNV
CLIENT_SECRET:NQD405VPACTG1UEFWFOVVLZ4FY0DU2YK5YYLKB2E2EQGKZPC


Using explore endpoint to get all venues that belong to "food" category. Let's start with 2km radius area around Montreal city center. 

In [3]:
# Find all daily lunch restaurant within 2km from Montreal city center

# search parameters
day = "Monday"
time = "12:00"
section= 'food'
LIMIT = 50
radius = 2000
latitude, longitude =  45.5017, -73.5673

# Get information from foursquare
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&section={}&limit={}&time={}&day={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, section, LIMIT, time, day)
results = requests.get(url).json()

# Itterate through all results, because of max 50 venues limit per request
n_results = results['response']['totalResults']
print('Number of found venues', n_results)
df = pd.DataFrame()
for i in range(0, n_results, 50):
    offset = i
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&section={}&limit={}&time={}&day={}&offset={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, section, LIMIT, time, day, offset)
    results_partial = requests.get(url).json()
    # Use only information about venues
    venues_partial = results_partial['response']['groups'][0]['items']
    # Create DataFrame from json
    #df_venues = json_normalize(venues_partial)
    df_venues = pd.DataFrame()
    df_venues['id'] = [x['venue']['id'] for x in venues_partial]
    df_venues['lat'] = [x['venue']['location']['lat'] for x in venues_partial]
    df_venues['lng'] = [x['venue']['location']['lng'] for x in venues_partial]
    df_venues['category'] = [x['venue']['categories'][0]['name'] for x in venues_partial]
    df = df.append(df_venues)


Number of found venues 248


In [4]:
print('Number of venues:', df.shape[0])
print('Columns:', df.columns)
df.head(3)


Number of venues: 248
Columns: Index(['id', 'lat', 'lng', 'category'], dtype='object')


Unnamed: 0,id,lat,lng,category
0,4b4e1626f964a5200ce126e3,45.504009,-73.568213,Pizza Place
1,550bd362498e0e9050efe954,45.503045,-73.567888,Taco Place
2,4b6a4262f964a520a1cf2be3,45.50073,-73.568971,Steakhouse


Let's investigate the found venues. To do that, we can represent the venues in the map as blue circles. To indicate the the radius of our search let's draw a red circle.

In [5]:
# Show in map
venues_map = folium.Map(location=[latitude, longitude], zoom_start=14)


for lat, lng, label in zip(df['lat'], df['lng'], df['id']):
    folium.features.CircleMarker(
        [lat, lng],
        radius=3,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

folium.features.Circle(
    [latitude, longitude],
    radius=2000,
    color='red',
    popup=label,
    opacity = 0.5,
    fill = False
).add_to(venues_map)
    
# display map
venues_map

Above we can see map with all possible places to get food within 2km radius. To understand which venues are more popular, we can look into likes recieved. To represent them, we can change the opacity of a marker for each venue based on normalized number of likes.

In [6]:
# Call GET https://api.foursquare.com/v2/venues/VENUE_ID for every venue, and extract needed information

df_venues = pd.DataFrame()

# Iterate through venues
for index, row in df.iterrows():
    venue_id = row['id']
    # call foursquere 'venues' endpoint to get all the information about the venue
    url = 'https://api.foursquare.com/v2/venues/{}/likes?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)
    venue_info = requests.get(url).json()
    #print(venue_info)
    # create df only with needed information for each venue
    info = venue_info['response']
    
    df_venue = pd.DataFrame()
    df_venue['id'] = [venue_id]
    df_venue['likes'] = [info['likes']['count']]
    df_venues = df_venues.append(df_venue)
df_venues.head(3)


Unnamed: 0,id,likes
0,4b4e1626f964a5200ce126e3,290
0,550bd362498e0e9050efe954,104
0,4b6a4262f964a520a1cf2be3,271


In [7]:
# Merge dataframes with restaurants info and likes
#df_ = df[['venue.id', 'venue.name', 'venue.location.lat', 'venue.location.lng']]
df_data = df_venues.merge(df, on='id')
df_data.head(3)

Unnamed: 0,id,likes,lat,lng,category
0,4b4e1626f964a5200ce126e3,290,45.504009,-73.568213,Pizza Place
1,550bd362498e0e9050efe954,104,45.503045,-73.567888,Taco Place
2,4b6a4262f964a520a1cf2be3,271,45.50073,-73.568971,Steakhouse


In [8]:
# Distribution of likes, apply log to normalize distribution
df_data['likes_log'] = np.log(df_data['likes'])
df_data['likes_log'] = (df_data['likes_log']-df_data['likes_log'].min())/(df_data['likes_log'].max()-df_data['likes_log'].min())
hist = df_data['likes_log'].hist(bins=100)
hist

<matplotlib.axes._subplots.AxesSubplot at 0x22e2cd38a90>

In [9]:
# Show in map
venues_map = folium.Map(location=[latitude, longitude], zoom_start=14)

df_data['color'] = ['red' if x < 0.33 else 'orange' if x < 0.66 else 'green' for x in df_data['likes_log']]

# add the Italian restaurants as blue circle markers
for lat, lng, label, col in zip(df_data['lat'], df_data['lng'], df_data['id'], df_data['color']):
    folium.features.Circle(
        [lat, lng],
        radius=20,
        color=col,
        popup=label,
        opacity=0.7
    ).add_to(venues_map)

folium.features.Circle(
    [latitude, longitude],
    radius=2000,
    color='red',
    popup=label,
    fill = False
).add_to(venues_map)
    
# display map
venues_map

Above, we can see the restaurants classified into 3 groups by their popularity. Next, let's build a classifier that will help us decide if given location is good for a new business.   

In [10]:
# For each restaurant, take 5 nearest restaurants and use their distance as features

def get_closest_rest_features(ll, df_data):
    X = []
    # Closest restaurants
    temp_df = pd.DataFrame()
    temp_df['proximity_lat'] = df_data['lat'] - ll[0]
    temp_df['proximity_lng'] = df_data['lng'] - ll[1]
    temp_df['proximity'] = temp_df['proximity_lat'].abs() + temp_df['proximity_lng'].abs()
    temp_df['id'] = df_data['id']
    temp_df = temp_df.sort_values(by=['proximity'], ascending=True)
    ids = temp_df[:5][['id', 'proximity']]# skip itself
    #print(ids)
    df_closest = ids.merge(df_data, on='id')
    X = X + [df_closest['proximity'].tolist() + df_closest['likes_log'].tolist()]
    return(X)



df_restaurant = df_data
X = []
for index, restaurant in df_restaurant.iterrows():
    
    # Closest restaurants
    temp_df = pd.DataFrame()
    temp_df['proximity_lat'] = df_data['lat'] - restaurant['lat']
    temp_df['proximity_lng'] = df_data['lng'] - restaurant['lng']
    temp_df['proximity'] = temp_df['proximity_lat'].abs() + temp_df['proximity_lng'].abs()
    temp_df['id'] = df_data['id']
    temp_df = temp_df.sort_values(by=['proximity'], ascending=True)
    ids = temp_df[1:6][['id', 'proximity']]# skip itself
    #print(ids)
    df_closest = ids.merge(df_data, on='id')
    X = X + [df_closest['proximity'].tolist() + df_closest['likes_log'].tolist()]

X = pd.DataFrame(X) 
print(X.columns)

#X[[0, 1, 2, 3, 4]] = (X[[0, 1, 2, 3, 4]]-X[[0, 1, 2, 3, 4]].min())/(X[[0, 1, 2, 3, 4]].max()-X[[0, 1, 2, 3, 4]].min())
#print(X.head())
y = df_restaurant['likes_log']

print(X.shape)
print(y.shape)
    

RangeIndex(start=0, stop=10, step=1)
(248, 10)
(248,)


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = LinearRegression()
clf = clf.fit(X, y)

In [12]:
# generate random coordinates and evaluate their likability
import random 

latitude, longitude =  45.5017, -73.5673
p_list = []
x = []
for i in range(5):
    lat_rand = (random.random()-0.5)/50 + latitude
    lng_rand = (random.random()-0.5)/50 + longitude
    p_list.append([lat_rand, lng_rand])
    rest_feat = get_closest_rest_features((lat_rand, lng_rand), df_data)
    
    x.append(rest_feat)

score = clf.predict(np.asarray(x).squeeze())
score

array([0.37307717, 0.41734054, 0.47021011, 0.32901485, 0.38644405])

In [13]:
# Show in map
venues_map = folium.Map(location=[latitude, longitude], zoom_start=14)# generate map centred around the Conrad Hotel

df_data['color'] = ['red' if x < 0.33 else 'orange' if x < 0.66 else 'green' for x in df_data['likes_log']]

# add the Italian restaurants as blue circle markers
for lat, lng, label, col in zip(df_data['lat'], df_data['lng'], df_data['id'], df_data['color']):
    folium.features.Circle(
        [lat, lng],
        radius=20,
        color=col,
        popup=label,
        opacity=0.7
    ).add_to(venues_map)
    
for i, p in enumerate(p_list):
    folium.features.Circle(
        p,
        radius=50,
        color='black',
        opacity=0.7,
        popup='{}'.format(score[i])
    ).add_to(venues_map)
    
folium.features.Circle(
    [latitude, longitude],
    radius=2000,
    color='red',
    popup=label,
    fill = False
).add_to(venues_map)
    
# display map
venues_map