## Neighborhoods in Toronto - Part 1

The code below import the libraries, scrape the data from Wikipedia and transform data in a Data Frame

In [1]:
import bs4 as bs
import urllib.request
import pandas as pd
import numpy as np

source = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').read()
soup = bs.BeautifulSoup(source, 'lxml')
table = soup.find('table')
df = pd.read_html(str(table))
df2 = pd.DataFrame(df[0])
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In this code, the data with the Borough is "Not Assigned" is dropped, the data with the same Post Code is grouped and the Borough name is given to unnamed Neighbourhood

In [2]:
#Dropped "Not Assigned" Borough
df2.drop(df2[df2['Borough']=="Not assigned"].index,axis=0, inplace=True)
#grouped same Postcode
df3=df2.groupby("Postcode").agg(lambda x:','.join(set(x))).reset_index()
#Rename the Neighbourhood
df3.loc[df3['Neighbourhood']=="Not assigned",'Neighbourhood']=df3.loc[df3['Neighbourhood']=="Not assigned",'Borough']
df3.shape

(103, 3)

## Neighborhoods in Toronto - Part 2

Tried to use geocoder, but didn't work well

In [3]:
#import geocoder # import geocoder

#df_location = df3.copy()
#postcodes = df_location['Postcode'].tolist()

#for postcode in postcodes:
#    # initialize your variable to None
#    lat_lng_coords = None

#    # loop until you get the coordinates
#    print('{}, Toronto, Ontario'.format(postcode))
#    while(lat_lng_coords is None):
#      g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
   
#   lat_lng_coords = g.latlng

#    latitude = lat_lng_coords[0]
#    longitude = lat_lng_coords[1]
    
#    df_location.loc[df_location.Postcode == postcodes, 'Latitude'] = latitude
#    df_location.loc[df_location.Postcode == postcodes, 'Longitude'] = longitude

Then i used de csv file to import the Latitude and longitude information

In [4]:
lat_lng = pd.read_csv('C:/Users/girot/Desktop/Data Science/9.Applied Data Science Capstone/Week 3 - Neighborhood Segmentation and Clustering/Geospatial_Coordinates.csv')
lat_lng.rename(columns={'Postal Code':'Postcode'},inplace=True)
lat_lng.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


..and join with Data

In [6]:
df_location = pd.merge(df3, lat_lng, how='outer', on=['Postcode'])
df_location.shape

(103, 5)

In [19]:
df_location.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"West Hill,Guildwood,Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Neighborhoods in Toronto - Part 3

In [None]:
conda install -c conda-forge folium 
conda install -c conda-forge geopy

In [16]:
import folium 
import json 
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

### Definition of credencials and functions to use on Foursquare

In [56]:
### Define Foursquare Credentials and Version
CLIENT_ID = 'M0V3GQAG4NFCPEOSCT4LZBTYB1DASQ1K0CKQRO1TATNKL5BY'
CLIENT_SECRET = '0VSA2NQ1DX22ZO3XFWPVRBWZEE3PE0ARZWTNFKED1OLLGDU3' 
VERSION = '20180604'

# Limit of number of venues returned by Foursquare API
LIMIT = 100 
# Define radius of search for Foursquare API
radius = 1000

In [52]:
# Function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [53]:
# Function to get Nearby Venues
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [54]:
# Function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

Use the getNearbyVenues Function to get a table of venues

In [57]:
toronto_venues = getNearbyVenues(names=df_location['Postcode'],
                                 latitudes=df_location['Latitude'],
                                 longitudes=df_location['Longitude']
                                )

looking for better categories

In [216]:
toronto_venues.head()
toronto_category = toronto_venues[['Venue Category', 'Venue']]
#toronto_category.groupby('Venue Category').count()
toronto_bars_category = toronto_category.groupby('Venue Category').count().filter(like=('Restaurant'), axis=0).reset_index()
toronto_bars_category.shape

(54, 2)

Create a database with Restaurant categories

In [241]:
df_toronto = toronto_venues[['Venue Category','Neighborhood','Venue Latitude', 'Venue Longitude']]
df_toronto.rename(columns={'Venue Category':'Category', 'Venue Latitude':'Latitude', 'Venue Longitude':'Longitude'},  inplace=True)
df_categories = df_toronto[df_toronto_bar['Venue Category'].str.contains('Restaurant')]
df_categories.head()

Unnamed: 0,Category,Neighborhood,Latitude,Longitude
0,Fast Food Restaurant,M1B,43.807448,-79.199056
6,Mexican Restaurant,M1E,43.766299,-79.19072
13,Korean Restaurant,M1G,43.770812,-79.214502
14,Hakka Restaurant,M1H,43.774697,-79.241142
15,Caribbean Restaurant,M1H,43.775222,-79.241678


Clustering with kMeans

In [239]:
# Set number of clusters
kclusters = 5
#toronto_grouped_clustering = df_categories.drop('Neighborhood', 1)
toronto_grouped_clustering = df_categories.drop(['Category','Neighborhood'], 1)

# Run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
categories_merged = df_categories.copy()
categories_merged['Cluster_Labels'] = kmeans.labels_
categories_merged.head()

Unnamed: 0,Category,Neighborhood,Latitude,Longitude,Cluster_Labels
0,Fast Food Restaurant,M1B,43.807448,-79.199056,3
6,Mexican Restaurant,M1E,43.766299,-79.19072,3
13,Korean Restaurant,M1G,43.770812,-79.214502,3
14,Hakka Restaurant,M1H,43.774697,-79.241142,3
15,Caribbean Restaurant,M1H,43.775222,-79.241678,3


Creating a map with the concentration of restaurants in toronto

In [238]:
# create map
toronto_latitude = 43.70011
toronto_longitude = -79.4163
map_clusters = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)
#map_clusters

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(categories_merged['Latitude'], 
                                  categories_merged['Longitude'], 
                                  categories_merged['Neighborhood'], 
                                  categories_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=3,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

