# Applied Data Science Capstone, Week 2 Assignment - Part 2

## Scrape Wikipedia to get neighborhood information for Toronto

[List of neighborhoods of Toronto](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

In [1]:
# find the table with BeautifulSoup. 

from bs4 import BeautifulSoup
import pandas as pd
import requests

scrape_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
html = requests.get(scrape_url, timeout=30)
# Stop here on an HTTP error status rather than parsing an error page below.
html.raise_for_status()
soup = BeautifulSoup(html.content, 'html5lib')
# Keep the first table on the page that carries the 'wikitable' class.
tables = soup.find_all('table', {'class': 'wikitable'})[0]

In [2]:
# extract headers

# Column names are the stripped texts of the <th> cells in the table's first row.
header = [cell.text.strip() for cell in tables.find('tr').find_all('th')]

# extract rows

# Each row becomes a list of its stripped <td> texts. Rows without any <td>
# cells (e.g. a header row made only of <th> cells) are skipped so they do
# not end up as an empty record in the table.
tbl = []
for row in tables.find_all('tr'):
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:
        tbl.append(cells)

In [3]:
# load table as pd.DataFrame

# One row per entry of tbl, with the scraped header texts as column labels.
neighborhood = pd.DataFrame(data=tbl, columns=header)

In [4]:
# clean data

# Rows whose Borough is 'Not assigned' are marked as missing and then removed;
# dropna also removes any other row that contains a missing value.
neighborhood = (
    neighborhood
    .replace(to_replace={'Borough': 'Not assigned'}, value={'Borough': pd.NaT})
    .dropna(axis=0)
)

# Where the Neighbourhood is 'Not assigned', fall back to the row's Borough.
condition = neighborhood['Neighbourhood'].eq('Not assigned')
neighborhood.loc[condition, 'Neighbourhood'] = neighborhood.loc[condition, 'Borough']

In [5]:
# merge neighbourhood if Postcode is the same.

def mergePostcode(x):
    """Collapse the rows of one Postcode group into a single record.

    Parameters
    ----------
    x : pd.DataFrame
        Rows sharing a Postcode; must have 'Borough' and 'Neighbourhood' columns.

    Returns
    -------
    pd.Series
        'Borough' holds the most frequent Borough value of the group and
        'Neighbourhood' holds the group's Neighbourhood values joined by ', '.
    """
    # value_counts sorts by frequency, so the first index entry is the top borough.
    top_borough = x['Borough'].value_counts().index[0]
    joined_names = ', '.join(x['Neighbourhood'])
    return pd.Series({'Borough': top_borough, 'Neighbourhood': joined_names})

# Reduce every Postcode group to one record with mergePostcode, then turn the
# Postcode group keys back into an ordinary column.
df = neighborhood.groupby('Postcode').apply(mergePostcode).reset_index()

__df__ stores the cleaned information about the neighbourhoods of Toronto.

In [6]:
# Show (rows, columns) of the merged table as a quick sanity check.
df.shape

(103, 3)

## Get latitude/longitude from Postcode and save to `df`

In [7]:
import googlemaps
import googlemaps_secret
import os

# Read the Google API key from the GOOGLE_API environment variable (None when unset).
API_KEY = os.environ.get('GOOGLE_API')
# Module-level client that addr2ll uses for its geocoding requests.
GMAPS = googlemaps.Client(key=API_KEY)

def addr2ll(address):
    """Geocode an address with the module-level GMAPS client.

    Parameters
    ----------
    address : str
        Free-form address text passed to ``GMAPS.geocode``.

    Returns
    -------
    The ``['geometry']['location']`` entry of the first geocoding match, or
    None when the geocoder returns no matches.
    """
    matches = GMAPS.geocode(address)
    return matches[0]['geometry']['location'] if matches else None

In [8]:
# Preview the first rows of df before the coordinates are added.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
from time import sleep

lat = []
lng = []
# Geocode the postcodes in row order so both lists line up with df's rows.
for postcode in df['Postcode']:
    addr = '{}, Toronto, Ontario'.format(postcode)
    latlng = addr2ll(addr)
    if latlng is None:
        # addr2ll returns None when the geocoder finds no match; store NaN so
        # the lists keep one entry per row instead of failing on None['lat'].
        lat.append(float('nan'))
        lng.append(float('nan'))
    else:
        lat.append(latlng['lat'])
        lng.append(latlng['lng'])
    sleep(0.1)  # short pause between consecutive geocoding requests
lat

[43.8066863,
 43.78453510000001,
 43.7635726,
 43.7709921,
 43.773136,
 43.7447342,
 43.7279292,
 43.7111117,
 43.716316,
 43.692657,
 43.7574096,
 43.7500715,
 43.7942003,
 43.7816375,
 43.8152522,
 43.7995252,
 43.8361247,
 43.8037622,
 43.7785175,
 43.7869473,
 43.7574902,
 43.789053,
 43.7701199,
 43.7527583,
 43.7827364,
 43.7532586,
 43.7459058,
 43.7258997,
 43.7543283,
 43.7679803,
 43.7374732,
 43.7390146,
 43.7284964,
 43.7616313,
 43.72588229999999,
 43.7063972,
 43.6953439,
 43.6763574,
 43.7090604,
 43.7053689,
 43.685347,
 43.6795571,
 43.6689985,
 43.6595255,
 43.7280205,
 43.7127511,
 43.7153834,
 43.7043244,
 43.6895743,
 43.6864123,
 43.6795626,
 43.667967,
 43.6658599,
 43.6542599,
 43.6571618,
 43.6514939,
 43.6447708,
 43.6579524,
 43.65057119999999,
 43.6408157,
 43.6471768,
 43.6481985,
 43.7332825,
 43.7116948,
 43.6969476,
 43.6727097,
 43.6626956,
 43.6532057,
 43.6289467,
 43.6464352,
 43.6484292,
 43.718518,
 43.709577,
 43.6937813,
 43.68902560000001,
 43.6

In [11]:
# Add the geocoded coordinates as Latitude/Longitude columns, then preview the result.
df = df.assign(Latitude=lat, Longitude=lng)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
