In [25]:
# Import required libraries

import urllib.request, urllib.parse, urllib.error
import bs4 as bs
import ssl
import numpy as np 
import pandas as pd 
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.4.1               |             py_0          26 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         713 KB

The following NEW packages will be INSTALLED:

    altair:  4.1.0-py_1 conda-forge
    branca:  0.4.1-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Downloading and Extracting Packages
branca-0.4.1         | 26 KB     | #####

## Use the Notebook to build the code to scrape the following Wikipedia page: 
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 

In [8]:

# Ignore SSL certificate errors
# NOTE(review): certificate verification is switched off on purpose here, which leaves
# the download open to tampering; re-enable it wherever the environment allows.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE


# Download the page; the with-block closes the HTTP response once it has been read
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', context=ctx) as response:
    html = response.read()
soup = bs.BeautifulSoup(html, 'html.parser')

# Retrieve all of the table rows and put them into a pandas dataframe.
# Every <tr> on the page is collected (not only those of the postal-code table),
# and each row becomes the list of the texts of its <td> cells.
table_rows = soup('tr')
row_list = [[cell.text for cell in table_row.find_all('td')] for table_row in table_rows]
Toronto_DF = pd.DataFrame(row_list)



## Process the scraped dataframe to satisfy the following requirements:

1- The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood.  
2- Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.  
3- More than one neighborhood can exist in one postal code area.   
4- If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.  
5- Use the .shape attribute to print the number of rows of your dataframe.

In [9]:
#Clean up the dataframe

#Just keep required columns (the first three scraped cells of every row)
Toronto_DF = Toronto_DF[[0,1,2]]

#Rename columns
Toronto_DF.columns=["PostalCode", "Borough", "Neighborhood"]

#Remove any '\n' in the strings
Toronto_DF = Toronto_DF.replace('\n','', regex=True)

#Drop rows that are not required: rows with missing cells, rows without an assigned
#borough, and rows whose second cell is 'B', 'NL' or 'NS'
#(presumably rows scraped from other tables on the page -- TODO confirm).
#.copy() makes the filtered result an independent frame so the column assignments
#below do not write into a slice of the previous frame.
Toronto_DF = Toronto_DF.dropna()
Toronto_DF = Toronto_DF[~Toronto_DF.Borough.isin(['Not assigned', 'B', 'NL', 'NS'])].copy()

#Requirement 4: a cell with a borough but a 'Not assigned' neighborhood
#takes the borough name as its neighborhood
unassigned_neighborhood = Toronto_DF['Neighborhood'] == 'Not assigned'
Toronto_DF.loc[unassigned_neighborhood, 'Neighborhood'] = Toronto_DF.loc[unassigned_neighborhood, 'Borough']

#Group rows if required: neighborhoods sharing a postal code and borough are joined
#into one comma-separated string, then the now-identical rows are collapsed
Toronto_DF['Neighborhood'] = Toronto_DF.groupby(['PostalCode','Borough'])['Neighborhood'].transform(lambda x: ','.join(x))
Toronto_DF = Toronto_DF.drop_duplicates()

Toronto_DF.shape

(103, 3)

## Get the latitude and the longitude coordinates of each postal code, using geocoder:

In [10]:
#Install geocoder (uncomment the following line to install geocoder if it is not installed already)
#!conda install -c conda-forge geocoder



In [11]:
# import geocoder
import geocoder 

# Define a function to convert PostalCode to lat and Long
def Postal_Code_to_LatLong(postal_code, max_attempts=10):
    """Look up the coordinates of a Toronto postal code with the ArcGIS geocoder.

    The query sent to the geocoder is '<postal_code>, Toronto, Ontario'. The lookup
    is retried while the geocoder reports no coordinates, but only up to
    ``max_attempts`` times so that a persistent failure cannot hang the notebook.

    Parameters
    ----------
    postal_code : str
        Postal code to interpolate into the geocoder query.
    max_attempts : int, optional
        Maximum number of geocoder requests to make (default 10).

    Returns
    -------
    The ``latlng`` value of the first geocoder result that is not None
    (callers treat it as a latitude/longitude pair).

    Raises
    ------
    RuntimeError
        If none of the attempts produced coordinates.
    """
    for _ in range(max_attempts):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    raise RuntimeError(
        'No coordinates found for postal code {!r} after {} attempt(s)'.format(postal_code, max_attempts)
    )


In [12]:
# Geocode every postal code and spread the resulting pairs over two new columns,
# keeping the original row index so the values line up with their rows
Toronto_DF[['Latitude', 'Longitude']] = pd.DataFrame(
    list(Toronto_DF['PostalCode'].map(Postal_Code_to_LatLong)),
    index=Toronto_DF.index,
)

Toronto_DF

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
3,M3A,North York,Parkwoods,43.752935,-79.335641
4,M4A,North York,Victoria Village,43.728102,-79.311890
5,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
6,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.661790,-79.389390
9,M9A,Etobicoke,Islington Avenue,43.667481,-79.528953
10,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
12,M3B,North York,Don Mills,43.748900,-79.357220
13,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529
14,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529


## Count the number of boroughs and neighborhoods:

In [13]:
# Report the number of distinct boroughs and the number of rows in the dataframe
print(
    'The dataframe has {} boroughs and {} neighborhoods.'.format(
        Toronto_DF['Borough'].unique().size,
        len(Toronto_DF),
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


## Get the latitude and longitude values of Toronto City:

In [17]:
# Geocode the city itself; the result is used to centre the maps below
address = 'Toronto, ON, Canada'
location = geocoder.arcgis(address)
lat_lng_coords = location.latlng
if lat_lng_coords is None:
    # Fail with a clear message instead of a TypeError on the indexing below
    raise RuntimeError('The geocoder returned no coordinates for {!r}'.format(address))
latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
# The message now names the city that was actually geocoded
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 43.648690000000045, -79.38543999999996.


## Create a map of Toronto with neighborhoods superimposed on top:

In [26]:
# Base map centred on the city coordinates computed earlier
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per dataframe row, with a "<neighborhood>, <borough>" popup
marker_fields = zip(
    Toronto_DF['Latitude'],
    Toronto_DF['Longitude'],
    Toronto_DF['Borough'],
    Toronto_DF['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_fields:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

## Define Foursquare Credentials and Version

In [35]:
# The code was removed by Watson Studio for sharing.

## Create a function to get nearby venues for each neighborhood:

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare's "explore" endpoint returns around each location.

    One request is made per (name, latitude, longitude) triple. The credentials,
    API version and result limit are read from the notebook-level names
    CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    names : iterable
        Label of each location (printed as progress and stored in the result).
    latitudes, longitudes : iterable
        Coordinates of each location, aligned with ``names``.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when no venue was collected.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an error status code.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # make the GET request; requests builds and escapes the query string,
        # and the timeout keeps one stalled connection from blocking the whole run
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # surface HTTP errors here rather than as a KeyError on the payload below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue without any category gets None instead of raising IndexError
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor also works when venue_rows is empty
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return nearby_venues

## Run the above function on each neighborhood in Toronto and create a new dataframe called "toronto_venues":

In [38]:
# Fetch the nearby venues for every row of Toronto_DF
toronto_venues = getNearbyVenues(
    names=Toronto_DF['Neighborhood'],
    latitudes=Toronto_DF['Latitude'],
    longitudes=Toronto_DF['Longitude'],
)

# Discard venues whose category is literally 'Neighborhood': a category with that
# name causes issues in the next steps
toronto_venues = toronto_venues.loc[toronto_venues['Venue Category'].ne('Neighborhood')]

# Show the size of the resulting dataframe, then its first rows
print(toronto_venues.shape)
toronto_venues.head()

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview
The Danforth West, Ri

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.752935,-79.335641,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.752935,-79.335641,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.752935,-79.335641,Sun Life,43.75476,-79.332783,Construction & Landscaping
3,Parkwoods,43.752935,-79.335641,MacLeod Exteriors Inc.,43.755014,-79.338688,Construction & Landscaping
4,Victoria Village,43.728102,-79.31189,Tim Hortons,43.725517,-79.313103,Coffee Shop


## How many venues were returned for each neighborhood:

In [69]:
# Count the non-null entries of every column for each distinct 'Neighborhood' value
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Alderwood, Long Branch",8,8,8,8,8,8
"Bathurst Manor, Wilson Heights, Downsview North",20,20,20,20,20,20
Bayview Village,2,2,2,2,2,2
"Bedford Park, Lawrence Manor East",20,20,20,20,20,20
Berczy Park,65,65,65,65,65,65
"Birch Cliff, Cliffside West",4,4,4,4,4,4
"Brockton, Parkdale Village, Exhibition Place",42,42,42,42,42,42
Business reply mail Processing Centre,100,100,100,100,100,100
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",63,63,63,63,63,63


## How many unique categories can be curated from all the returned venues:

In [70]:
# Report how many distinct values appear in the 'Venue Category' column
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 259 uniques categories.


##  Analyze Each Neighborhood:

In [72]:
# One indicator column per venue category (no prefix, so each column is named after its category)
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighborhood of every venue; the new column lands at the far right
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# Rotate the column order so that the last column (the neighborhood) comes first
fixed_columns = toronto_onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Find the overall shape of the dataframe:

In [73]:
# (number of venue rows, number of columns) of the one-hot encoded frame
toronto_onehot.shape

(2249, 260)

## Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category:

In [74]:
# Average the indicator columns per neighborhood, keeping 'Neighborhood' as a regular column
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.00000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
1,"Alderwood, Long Branch",0.00000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
2,"Bathurst Manor, Wilson Heights, Downsview North",0.00000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.000000,...,0.00,0.000000,0.000000,0.050000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
3,Bayview Village,0.00000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
4,"Bedford Park, Lawrence Manor East",0.00000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
5,Berczy Park,0.00000,0.000000,0.000000,0.000000,0.0,0.015385,0.00,0.000000,0.000000,...,0.00,0.015385,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.015385
6,"Birch Cliff, Cliffside West",0.00000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
7,"Brockton, Parkdale Village, Exhibition Place",0.02381,0.000000,0.000000,0.000000,0.0,0.023810,0.00,0.023810,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000
8,Business reply mail Processing Centre,0.00000,0.000000,0.000000,0.020000,0.0,0.010000,0.00,0.000000,0.020000,...,0.00,0.020000,0.000000,0.000000,0.000000,0.010000,0.00,0.000000,0.000000,0.000000
9,"CN Tower, King and Spadina, Railway Lands, Har...",0.00000,0.000000,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.015873,...,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000,0.000000


## Function to sort the venues in descending order:

In [75]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``, largest first.

    The first entry of ``row`` is skipped; the remaining entries are ranked by
    value in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values[:num_top_venues]
    return top_labels

## Create the new dataframe and display the top 10 venues for each neighborhood:

In [165]:
num_top_venues = 10

# ordinal suffixes for the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare except, which would hide unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood of toronto_grouped
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill the ranking columns row by row with the top categories of each neighborhood
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Sushi Restaurant,Skating Rink,Badminton Court,Supermarket,Breakfast Spot,Fast Food Restaurant,Field,Farmers Market,Fish & Chips Shop,Donut Shop
1,"Alderwood, Long Branch",Coffee Shop,Convenience Store,Sandwich Place,Pub,Gym,Gas Station,Pharmacy,Pizza Place,Farm,Farmers Market
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Pharmacy,Deli / Bodega,Shopping Mall,Middle Eastern Restaurant,Sandwich Place,Supermarket,Sushi Restaurant,Fried Chicken Joint
3,Bayview Village,Construction & Landscaping,Trail,Yoga Studio,Farm,Eastern European Restaurant,Electronics Store,Elementary School,Ethiopian Restaurant,Falafel Restaurant,Farmers Market
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Sandwich Place,Comfort Food Restaurant,Juice Bar,Restaurant,Café,Thai Restaurant,Pub,Sports Club


## Cluster neighborhoods: run k-means to group the neighborhoods into 5 clusters:


In [166]:
# set number of clusters
kclusters = 5

# keep only the numeric category-frequency columns for clustering;
# columns= is used because newer pandas no longer accepts the axis as a positional argument
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([4, 4, 4, 0, 4, 4, 4, 4, 4, 4], dtype=int32)

## Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood:

In [167]:
# add clustering labels as the first column; a column left over from a previous run
# of this cell is removed first, because insert() raises if the column already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and top venues to each Toronto_DF row (which already holds
# the latitude/longitude), matching on the neighborhood name; rows left with missing
# values after the join are dropped
toronto_merged = Toronto_DF.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood').dropna()

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,M3A,North York,Parkwoods,43.752935,-79.335641,0.0,Construction & Landscaping,Food & Drink Shop,Park,Falafel Restaurant,Donut Shop,Eastern European Restaurant,Electronics Store,Elementary School,Ethiopian Restaurant,Farm
4,M4A,North York,Victoria Village,43.728102,-79.31189,4.0,Coffee Shop,French Restaurant,Pizza Place,Portuguese Restaurant,Intersection,Park,Donut Shop,Eastern European Restaurant,Electronics Store,Elementary School
5,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041,4.0,Pub,Coffee Shop,Athletics & Sports,Café,French Restaurant,Chocolate Shop,Intersection,Bakery,Tech Startup,Distribution Center
6,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211,4.0,Clothing Store,Cosmetics Shop,Pharmacy,Food Court,Bookstore,Restaurant,Toy / Game Store,American Restaurant,Men's Store,Furniture / Home Store
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939,4.0,Coffee Shop,Café,Sushi Restaurant,Sandwich Place,Italian Restaurant,Fried Chicken Joint,Bookstore,Smoothie Shop,Burrito Place,Yoga Studio


## Visualize the resulting clusters:

In [168]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # int() because the label column may hold floats; cluster i uses color i
    # (no "- 1" offset, which mapped cluster 0 to the last color via negative indexing)
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters