# import data

In [3]:
import time
import pandas as pd
import numpy as np

## Load data from HTML

In [4]:
# Wikipedia page holding the table of Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per table found on the page; cells whose
# text is 'Not assigned' are read as missing values.
data = pd.read_html(
    url,
    header=0,
    encoding='utf-8',
    na_values='Not assigned',
)

In [5]:
# The first table parsed from the page is the postal-code table used below.
data[0]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,,
176,M6Z,,
177,M7Z,,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [6]:
# Working frame for the cleaning steps, built from the first parsed table.
Toronto_df = pd.DataFrame(data=data[0])
Toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Process data

### Drop rows whose Borough is "Not assigned"

In [7]:
# Keep only the rows that have a Borough; 'Not assigned' was read as NaN.
Toronto_df = Toronto_df.dropna(subset=['Borough'])
Toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# NaN never compares equal to anything (not even itself), so `== np.nan`
# always selects zero rows. isna() is the check that can actually find
# leftover missing boroughs.
Toronto_df[Toronto_df['Borough'].isna()]

Unnamed: 0,Postal Code,Borough,Neighbourhood


### Fill empty Neighbourhood values with the Borough

In [9]:
# `== np.nan` is always False, so it could never reveal a missing
# Neighbourhood; isna() lists the rows that would need to be filled.
Toronto_df[Toronto_df['Neighbourhood'].isna()]

Unnamed: 0,Postal Code,Borough,Neighbourhood


No rows have an empty Neighbourhood, so nothing needs to be filled in.

In [10]:
# Look at one postal code whose Neighbourhood cell lists several names.
Toronto_df[Toronto_df['Postal Code']=='M8Z']

Unnamed: 0,Postal Code,Borough,Neighbourhood
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Process Neighbourhood data

Split

In [11]:
# The names are separated by ", ". Splitting on the bare comma would leave a
# leading space on every name after the first (" Harbourfront"), so the
# whitespace around each comma is consumed as part of the separator.
df_neighbour = Toronto_df['Neighbourhood'].str.split(r'\s*,\s*', expand=True)
df_neighbour.head()

Unnamed: 0,0,1,2,3,4,5,6,7
2,Parkwoods,,,,,,,
3,Victoria Village,,,,,,,
4,Regent Park,Harbourfront,,,,,,
5,Lawrence Manor,Lawrence Heights,,,,,,
6,Queen's Park,Ontario Provincial Government,,,,,,


Stack the split columns into a single Series

In [12]:
# Turn the wide table (one column per split position) into one long Series.
# Empty cells are dropped, and the result is indexed by
# (original row label, split position).
df_neighbour=df_neighbour.stack()
df_neighbour

2    0                    Parkwoods
3    0             Victoria Village
4    0                  Regent Park
     1                 Harbourfront
5    0               Lawrence Manor
                    ...            
178  0                    Mimico NW
     1           The Queensway West
     2               South of Bloor
     3     Kingsway Park South West
     4        Royal York South West
Length: 219, dtype: object

Reset the index: drop the inner level so only the original row label remains

In [13]:
# Drop the inner index level (the split position) so that every neighbourhood
# is indexed only by the row label of the postal code it came from.
df_neighbour = df_neighbour.reset_index(level=1,drop=True)
df_neighbour

2                      Parkwoods
3               Victoria Village
4                    Regent Park
4                   Harbourfront
5                 Lawrence Manor
                 ...            
178                    Mimico NW
178           The Queensway West
178               South of Bloor
178     Kingsway Park South West
178        Royal York South West
Length: 219, dtype: object

Combine

In [14]:
# Name the Series so that it joins back in as the 'Neighbourhood' column.
df_neighbour = df_neighbour.rename('Neighbourhood')

# Replace the comma-separated column with one row per neighbourhood, joining
# on the original row labels, then renumber the rows from 0.
toronto = (
    Toronto_df
    .drop(columns=['Neighbourhood'])
    .join(df_neighbour)
    .reset_index(drop=True)
)
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Manor


Let's show the dataframe shape

In [15]:
# (rows, columns) of the frame with one row per neighbourhood.
toronto.shape

(219, 3)

# Get Geo information

## Get geocoder

In [16]:
import geocoder # import geocoder
from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values

from arcgis.geocoding import geocode
from arcgis.gis import GIS
# Create the GIS object without passing a URL or credentials.
# NOTE(review): presumably this is what lets the module-level `geocode`
# calls below work — confirm against the arcgis documentation.
gis = GIS()

In [17]:
# Row for postal code M5G, which is used below to test the geocoding helper.
toronto[toronto['Postal Code']=='M5G']

Unnamed: 0,Postal Code,Borough,Neighbourhood
42,M5G,Downtown Toronto,Central Bay Street


### Function to get the geo 2D position

In [18]:
# For France
def get_2D_FR(address):
    """Geocode an address in France and return its coordinates as strings.

    ", France" is appended to ``address`` before it is passed to ``geocode``;
    only the first candidate returned is used.

    Parameters
    ----------
    address : str
        Free-form address or postal code.

    Returns
    -------
    list of str
        ``[latitude, longitude]``, each converted with ``str``.

    Raises
    ------
    IndexError
        If ``geocode`` returns no candidate for the address.
    """
    query = '{}, France'.format(address)
    candidates = geocode(address=query)
    if not candidates:
        # Same exception type that indexing an empty result raised before,
        # but the message now names the query that failed.
        raise IndexError('no geocoding result for {!r}'.format(query))
    location = candidates[0]['location']
    # 'x' is used as the longitude and 'y' as the latitude.
    lng_coords = location['x']
    lat_coords = location['y']
    return [str(lat_coords), str(lng_coords)]

In [19]:
# For Canada
def get_2D_CA(address):
    """Geocode an address in Canada and return its coordinates as strings.

    ", Canada" is appended to ``address`` before it is passed to ``geocode``;
    only the first candidate returned is used.

    Parameters
    ----------
    address : str
        Free-form address or postal code, e.g. ``'M5G'``.

    Returns
    -------
    list of str
        ``[latitude, longitude]``, each converted with ``str``.

    Raises
    ------
    IndexError
        If ``geocode`` returns no candidate for the address.
    """
    query = '{}, Canada'.format(address)
    candidates = geocode(address=query)
    if not candidates:
        # Same exception type that indexing an empty result raised before,
        # but the message now names the query that failed.
        raise IndexError('no geocoding result for {!r}'.format(query))
    location = candidates[0]['location']
    # 'x' is used as the longitude and 'y' as the latitude.
    lng_coords = location['x']
    lat_coords = location['y']
    return [str(lat_coords), str(lng_coords)]

Test the helper against the geo position given by the instructor; the result looks correct.

In [20]:
# Spot-check the helper on a single postal code; it returns [latitude, longitude] as strings.
get_2D_CA('M5G')

['43.65609000000006', '-79.38492999999994']

Get the postal code series

In [21]:
# Postal code of every row (codes repeat when they cover several neighbourhoods).
toronto_postalcode = toronto.loc[:, 'Postal Code']
toronto_postalcode.head()

0    M3A
1    M4A
2    M5A
3    M5A
4    M6A
Name: Postal Code, dtype: object

### Query geo 2D position

Retrieve the 2D geo position for each postal code.

In [22]:
# Many rows share a postal code, so geocode each distinct code once and map
# the answer back onto every row instead of sending one request per row.
# Rows with the same code therefore share the same [lat, lng] list object;
# it is only read afterwards.
geo_by_code = {code: get_2D_CA(code) for code in toronto_postalcode.unique()}
toronto_geo_2D = toronto_postalcode.map(geo_by_code)
toronto_geo_2D.head()

0    [43.75245000000007, -79.32990999999998]
1    [43.73057000000006, -79.31305999999995]
2    [43.65512000000007, -79.36263999999994]
3    [43.65512000000007, -79.36263999999994]
4    [43.72327000000007, -79.45041999999995]
Name: Postal Code, dtype: object

Merge the two dataframes into one

In [23]:
# Name the coordinate Series so it becomes the 'geo_2D' column, then place it
# next to the neighbourhood table (both share the same row index).
toronto_geo_2D = toronto_geo_2D.rename('geo_2D')
toronto_merged = pd.concat([toronto, toronto_geo_2D], axis='columns')
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,geo_2D
0,M3A,North York,Parkwoods,"[43.75245000000007, -79.32990999999998]"
1,M4A,North York,Victoria Village,"[43.73057000000006, -79.31305999999995]"
2,M5A,Downtown Toronto,Regent Park,"[43.65512000000007, -79.36263999999994]"
3,M5A,Downtown Toronto,Harbourfront,"[43.65512000000007, -79.36263999999994]"
4,M6A,North York,Lawrence Manor,"[43.72327000000007, -79.45041999999995]"


In [24]:
# Last rows of the neighbourhood table, for comparison with the merged frame.
toronto.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood
214,M8Z,Etobicoke,Mimico NW
215,M8Z,Etobicoke,The Queensway West
216,M8Z,Etobicoke,South of Bloor
217,M8Z,Etobicoke,Kingsway Park South West
218,M8Z,Etobicoke,Royal York South West


### Construct the final dataframe toronto

In [25]:
# Build the final frame from `toronto` and `toronto_geo_2D` instead of
# mutating `toronto_merged` in place. Dropping 'geo_2D' in place made this
# cell raise a KeyError when it was run a second time; this version can be
# re-run safely and gives the same columns in the same order.
toronto_merged = toronto.assign(
    latitude=toronto_geo_2D.apply(lambda pair: float(pair[0])),
    longitude=toronto_geo_2D.apply(lambda pair: float(pair[1])),
)
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,latitude,longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,Regent Park,43.65512,-79.36264
3,M5A,Downtown Toronto,Harbourfront,43.65512,-79.36264
4,M6A,North York,Lawrence Manor,43.72327,-79.45042


In [26]:
# Row counts must match before and after adding the coordinate columns.
for frame in (toronto, toronto_merged):
    print(frame.shape)

(219, 3)
(219, 5)


Compare the row counts before and after the merge: both frames have 219 rows, so nothing was lost.

# Create a map of Toronto

In [27]:
# Geocode the city itself; the [latitude, longitude] pair (as strings) is
# used as the centre of the map below.
toronto_ll = get_2D_CA('toronto')
toronto_ll

['43.648690000000045', '-79.38543999999996']

In [28]:
# Column names available for building the map markers.
toronto_merged.columns

Index(['Postal Code', 'Borough', 'Neighbourhood', 'latitude', 'longitude'], dtype='object')

In [29]:
import folium # map rendering library

# Base map centred on the geocoded position of the city.
map_toronto = folium.Map(location=toronto_ll, zoom_start=10)

# Add one circle marker per row, labelled "<neighbourhood>, <borough>".
for lat, lng, borough, neighborhood in zip(toronto_merged['latitude'], toronto_merged['longitude'],
                                           toronto_merged['Borough'], toronto_merged['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is an option of folium.Popup (set above), not of
    # CircleMarker, so it is no longer passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

In [33]:
from IPython.display import Image

# Display the image stored at images/Toronto_map.png (path relative to the notebook).
Image(url="images/Toronto_map.png")