# Capstone project for Coursera IBM Data Science

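This notebook is used for the Coursera IBM Data Science Capstone Project. It scrapes the Canadian postal codes beginning with "M" from Wikipedia, consolidates the neighbourhoods by postal code, and attaches a latitude and longitude to each postal code.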

In [1]:
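# Optional: installs the geocoder package. Left disabled because Phase 2 loads pre-geocoded coordinates from a CSV instead.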
#!conda install conda-forge::geocoder --yes

In [2]:
import pandas as pd
import numpy as np

# Table of Contents

1. Phase 1: Scrape and Transform
2. Phase 2: Attach geocoding

# Phase 1: Scrape and Transform

### 1. Scrape postal codes and neighborhoods from Wikipedia
Out of all the tables, the first one with the text "Borough" has the data we need.

In [3]:
# Wikipedia page listing the postal codes that start with "M"
postal_codes_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# read_html returns a list of every table on the page containing the text "Borough";
# the first match is the one holding the postal-code data.
borough_tables = pd.read_html(postal_codes_url, match="Borough")
postal_codes_raw = borough_tables[0]
postal_codes_raw.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### 2. Remove 'Not assigned' entries

In [4]:
# Sanity check: look for rows whose Neighbourhood is 'Not assigned' while the Borough
# holds a different value. An empty result means every unassigned neighbourhood also
# has an unassigned borough, so filtering on Borough alone is enough.
neighbourhood_missing = postal_codes_raw['Neighbourhood'].eq('Not assigned')
differs_from_borough = postal_codes_raw['Neighbourhood'].ne(postal_codes_raw['Borough'])
q = postal_codes_raw[neighbourhood_missing & differs_from_borough]
q

Unnamed: 0,Postcode,Borough,Neighbourhood


In [5]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = postal_codes_raw['Borough'].ne('Not assigned')
postal_codes_filter = postal_codes_raw[has_borough]
postal_codes_filter.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### 3. Consolidate by Postcode

In [6]:
# Consolidate the filtered table so that each Postcode/Borough pair appears once,
# with all of its neighbourhoods joined into a single comma-separated string.
keycolumns = ['Postcode', 'Borough']
# The key columns become a two-level index; every remaining column is aggregated below.
# set_index returns a new frame, so postal_codes_filter itself is left untouched.
postal_codes = postal_codes_filter.set_index(keycolumns)
# Built as one chain without inplace=True: each step returns a new frame.
postal_codes_clean = (
    postal_codes
    .groupby(level=keycolumns)
    # str.cat ignores missing values by default, which a plain ','.join would not
    .agg(lambda names: names.str.cat(sep=','))
    # Borough goes back to a regular column; Postcode stays as the index
    .reset_index(level='Borough')
    # Postcode is the index at this point, so sort on the index directly
    .sort_index()
)
postal_codes_clean.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


### 4. And the answer is: the shape (rows, columns) of the consolidated table

In [7]:
# (rows, columns): one row per Postcode after consolidation; the columns are Borough and Neighbourhood
postal_codes_clean.shape

(103, 2)

# Phase 2: Attach geocoding

**Note:** The geocoder package did not work here; it appears to depend on a paid API key. The pre-geocoded CSV loaded below is used instead.

### 1. Get the Geocoded Postal Codes

In [8]:
# CSV with one row per postal code and its latitude/longitude
geocode_url = "https://cocl.us/Geospatial_data"
# pandas fetches the file straight from the URL
geodata = pd.read_csv(filepath_or_buffer=geocode_url)
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 2. Attach geocoding to Neighbourhoods

In [9]:
# Align the geocoding table with postal_codes_clean: same key name ('Postcode'),
# ordered by that key, and with the key promoted to the index so the two can be joined.
geodata_clean = (
    geodata
    .rename(columns={'Postal Code': 'Postcode'})
    .sort_values('Postcode')
    .set_index('Postcode')
)
geodata_clean.head()

Unnamed: 0_level_0,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [12]:
# Quick pre-join check: show the (rows, columns) of both Postcode-indexed frames, one per line
print(postal_codes_clean.shape, geodata_clean.shape, sep='\n')

(103, 2)
(103, 2)


In [15]:
# Left join on the shared Postcode index: every consolidated postal code is kept and
# picks up Latitude/Longitude where the geocoding table has a matching key.
postal_codes_geocoded = postal_codes_clean.join(geodata_clean, how='left')
# Turn Postcode back into an ordinary column for the final table
postal_codes_final = postal_codes_geocoded.reset_index()
postal_codes_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
