# Capstone project for Coursera IBM Data Science

This will be used for the Coursera IBM Data Science Capstone Project

In [34]:
import pandas as pd
import numpy as np

# Table of Contents

1. Phase 1: Scrape and Transform
2. Phase 2: Apply Latitude and Longitude

# Phase 1: Scrape and Transform

### 1. Scrape postal codes and neighborhoods from Wikipedia
Out of all the tables, the first one with the text "Borough" has the data we need.

In [44]:
postal_codes_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
postal_codes_raw = pd.read_html(io=postal_codes_url, match="Borough")[0]    # Grab the first table
postal_codes_raw.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### 2. Remove 'Not assigned' entries

In [50]:
postal_codes_clean = postal_codes_raw[postal_codes_raw['Borough'] != 'Not assigned']
postal_codes_clean.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### 3. Consolidate by Postcode

In [123]:
# Group by columns
keycolumns=['Postcode','Borough']
# Preserve the keys with a multiindex
mi = pd.MultiIndex.from_frame(postal_codes_clean[keycolumns])
# Create an independent dataframe with the key as the index and only the neighborhood as the column
postal_codes = postal_codes_clean\
                    .copy()\
                    .set_index(mi)\
                    .drop(columns=keycolumns)
# Concatenate the neighborhoods with a comma
postal_codes_final = postal_codes.groupby(keycolumns).transform(lambda x: x.str.cat(sep=','))
# Move the index back to columns
postal_codes_final.reset_index(inplace=True)
postal_codes_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M6A,North York,"Lawrence Heights,Lawrence Manor"


### 4. And the answer is:

In [124]:
postal_codes_final.shape

(210, 3)