# Coursera Capstone Project

<p> This Notebook will be used primarily for the Coursera Capstone project for the IBM Data Science Specialization</P>

In [1]:
import pandas as pd
import numpy as np

In [2]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Scraping Wikipedia for the postal codes

In [3]:
from bs4 import BeautifulSoup
import requests

Import the lxml file from wikipedia

In [4]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')


Test to find table and first postal code

In [5]:
table = soup.find('table')
postcode = table.td.text
print(postcode)

M1A



Create a list for the postal code, borough, and nighbourhood. 
Add them to a pandas dataframe

In [6]:
postalCodeL = []
boroughL = []
neighbourhoodL = []

In [7]:
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if (len(cells) >0):
        postalCodeL.append(cells[0].text.rstrip('\n')) #removes newlines at the end of the cell
        boroughL.append(cells[1].text.rstrip('\n'))
        neighbourhoodL.append(cells[2].text.rstrip('\n').replace(' /', ',')) #Also changes the "/" from Wikipedia to ","

In [8]:
toronto_df = pd.DataFrame({"PostalCode": postalCodeL,
                          "Borough": boroughL,
                          "Neighbourhood":neighbourhoodL})
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


Remove any columns if Borough is 'Not assigned'. Reset index to tidy up table.

In [9]:
notassigned = toronto_df[toronto_df['Borough']=='Not assigned']
torontona_df = toronto_df.drop(notassigned.index, axis=0)
torontof_df = torontona_df.reset_index(drop=True)
torontof_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Wikipedia no longer has duplicate Postal Codes on their table, and there are no unassigned neighbourhoods. Some of the cleaning no longer needs to be done.

In [10]:
for nh in torontof_df['Neighbourhood']:
    print(nh)

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview
The Danforth West, Ri

Checking with the example

In [11]:
column_names = ["PostalCode", "Borough", "Neighbourhood"]
test_df = pd.DataFrame(columns=column_names)
expcs = ['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A']
for pc in expcs:
    test_df = test_df.append(torontof_df[torontof_df["PostalCode"]==pc], ignore_index = True)
test_df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


Print the number of rows in the dataframe

In [12]:
torontof_df.shape

(103, 3)

## Importing Geospatial Data

In [21]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging tables together

In [22]:
coordinates_df.rename(columns = {"Postal Code":"PostalCode"}, inplace = True)
coordinates_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [23]:
torontoc_df = torontof_df.merge(coordinates_df, on="PostalCode", how="left")
torontoc_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Testing 

In [24]:
column_names = ["PostalCode", "Borough", "Neighbourhood", "Latitude", "Longitude"]
test2_df = pd.DataFrame(columns=column_names)
for postcode in expcs:
    test2_df = test2_df.append(torontoc_df[torontoc_df["PostalCode"]==postcode], ignore_index=True)
    
test2_df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
