Scrape table using pandas

In [2]:
import pandas as pd
df=pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


Ignoring cells with a borough that is Not assigned

In [3]:
df=df[df.Borough != 'Not assigned']
df = df.sort_values(by=['Postcode','Borough'])
df.reset_index(inplace=True)
df.drop('index',axis=1,inplace=True)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union


More than one neighborhood can exist in one postal code area, so rows are combined into one row with the neighborhoods separated with a comma

In [4]:
df.groupby(['Postcode','Borough'], sort = False).agg(lambda x: ','.join(x))

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
M1N,Scarborough,"Birch Cliff,Cliffside West"


If a cell has a borough but a Not assigned neighborhood, then the neighborhood is the same as the borough

In [5]:
df['Neighbourhood']=df['Neighbourhood'].replace('Not assigned', df['Borough'])
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union
5,M1E,Scarborough,Guildwood
6,M1E,Scarborough,Morningside
7,M1E,Scarborough,West Hill
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae


In [6]:
df.shape

(211, 3)

In [7]:
!pip install geocoder
import geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 15.1MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [8]:
pcode_list = df['Postcode'].tolist()

lat=[]
lon=[]

for code in pcode_list:
    
    lat_lng_coords = None
    
    while(lat_lng_coords is None):
      g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
      lat_lng_coords = g.latlng
        
    lat.append(lat_lng_coords[0])
    lon.append(lat_lng_coords[1])
    
df['Latitude'] = lat
df['Longitude'] = lon

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.811525,-79.195517
1,M1B,Scarborough,Malvern,43.811525,-79.195517
2,M1C,Scarborough,Highland Creek,43.785665,-79.158725
3,M1C,Scarborough,Rouge Hill,43.785665,-79.158725
4,M1C,Scarborough,Port Union,43.785665,-79.158725


In [21]:
CLIENT_ID = 'RVY20ETWBXIX3CH001SDLK43P3U4RSZJR5SS4JFVU0JC5IQO' # your Foursquare ID
CLIENT_SECRET = 'LBQX5SMTTHBK4S4TQ5XFCI532K3HR1MK0J1QM1GQZ4V5HO4Z' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = '100'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: RVY20ETWBXIX3CH001SDLK43P3U4RSZJR5SS4JFVU0JC5IQO
CLIENT_SECRET:LBQX5SMTTHBK4S4TQ5XFCI532K3HR1MK0J1QM1GQZ4V5HO4Z


In [31]:
import requests
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [32]:
toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Rouge
Malvern
Highland Creek
Rouge Hill
Port Union
Guildwood
Morningside
West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park
Ionview
Kennedy Park
Clairlea
Golden Mile
Oakridge
Cliffcrest
Cliffside
Scarborough Village West
Birch Cliff
Cliffside West
Dorset Park
Scarborough Town Centre
Wexford Heights
Maryvale
Wexford
Agincourt
Clarks Corners
Sullivan
Tam O'Shanter
Agincourt North
L'Amoreaux East
Milliken
Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview
Henry Farm
Oriole
Bayview Village
Silver Hills
York Mills
Newtonbrook
Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park
Don Mills South
Bathurst Manor
Downsview North
Wilson Heights
Northwood Park
York University
CFB Toronto
Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens
Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West
Riverdale
The Beaches West
Indi

In [33]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rouge,43.811525,-79.195517,Wood Bison Paddock,43.811732,-79.200708,Zoo Exhibit
1,Rouge,43.811525,-79.195517,Canadian Appliance Source Whitby,43.808353,-79.191331,Home Service
2,Malvern,43.811525,-79.195517,Wood Bison Paddock,43.811732,-79.200708,Zoo Exhibit
3,Malvern,43.811525,-79.195517,Canadian Appliance Source Whitby,43.808353,-79.191331,Home Service
4,Highland Creek,43.785665,-79.158725,Affordable Toronto Movers,43.787919,-79.162977,Moving Target


In [35]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()


Unnamed: 0,Zoo Exhibit,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Art Gallery,Art Museum,Arts & Crafts Store,...,Tram Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Zoo Exhibit,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,American Restaurant,Antique Shop,Art Gallery,Art Museum,...,Tram Station,Transportation Service,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.0,0.0,0.000000,0.000000,0.00,0.030000,0.0,0.010000,0.00,...,0.0,0.0,0.010000,0.000000,0.000000,0.01000,0.00,0.000000,0.000000,0.000000
1,Agincourt,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.000000,0.058824,0.00000,0.00,0.000000,0.000000,0.000000
2,Agincourt North,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00,0.000000,0.000000,0.000000
3,Albion Gardens,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00,0.000000,0.000000,0.000000
4,Alderwood,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00,0.000000,0.000000,0.000000
5,Bathurst Quay,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.014706,0.000000,0.000000,0.00000,0.00,0.000000,0.000000,0.014706
6,Bayview Village,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00,0.000000,0.000000,0.000000
7,Beaumond Heights,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00,0.000000,0.000000,0.000000
8,Bedford Park,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.00,...,0.0,0.0,0.000000,0.000000,0.000000,0.00000,0.00,0.000000,0.000000,0.000000
9,Berczy Park,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.016129,0.00,...,0.0,0.0,0.016129,0.000000,0.000000,0.00000,0.00,0.000000,0.000000,0.000000


In [39]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Hotel,Steakhouse,Restaurant,Burger Joint,Bar,Breakfast Spot,Asian Restaurant,Gastropub
1,Agincourt,Shopping Mall,Supermarket,Chinese Restaurant,Hong Kong Restaurant,Pool,Shanghai Restaurant,Sushi Restaurant,Bakery,Bubble Tea Shop,Badminton Court
2,Agincourt North,Pharmacy,Yoga Studio,Falafel Restaurant,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Elementary School,Ethiopian Restaurant,Event Space,Farm
3,Albion Gardens,Grocery Store,Pizza Place,Liquor Store,Fried Chicken Joint,Beer Store,Sandwich Place,Japanese Restaurant,Coffee Shop,Fast Food Restaurant,Park
4,Alderwood,Sandwich Place,Convenience Store,Pub,Gym,Ethiopian Restaurant,Dog Run,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store


In [41]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 3, 2, 2, 2, 0, 2, 2, 2], dtype=int32)

In [64]:


toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood', how = 'right')
toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,Rouge,43.811525,-79.195517,2,Zoo Exhibit,Home Service,Hobby Shop,Flea Market,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant
1,M1B,Scarborough,Malvern,43.811525,-79.195517,2,Zoo Exhibit,Home Service,Hobby Shop,Flea Market,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant
2,M1C,Scarborough,Highland Creek,43.785665,-79.158725,2,Construction & Landscaping,Moving Target,Bar,Falafel Restaurant,Electronics Store,Elementary School,Ethiopian Restaurant,Event Space,Farm,Food
3,M1C,Scarborough,Rouge Hill,43.785665,-79.158725,2,Construction & Landscaping,Moving Target,Bar,Falafel Restaurant,Electronics Store,Elementary School,Ethiopian Restaurant,Event Space,Farm,Food
4,M1C,Scarborough,Port Union,43.785665,-79.158725,2,Construction & Landscaping,Moving Target,Bar,Falafel Restaurant,Electronics Store,Elementary School,Ethiopian Restaurant,Event Space,Farm,Food


In [65]:
!pip install folium



In [66]:
!pip install geopy



In [67]:
from geopy.geocoders import Nominatim
address = 'Toronto, TO'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude


In [68]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        #color=rainbow[cluster-1],
        color = rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters