# Scraping Postcodes, Boroughs and Neighborhoods in Canada

In [109]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from urllib.request import urlopen
import geocoder 
import folium

### Creating Beautiful Soup objects and passing URL

In [91]:
# Download the Wikipedia page and parse it. The context manager closes the
# HTTP response as soon as BeautifulSoup has consumed it, instead of leaving
# the connection open for the lifetime of the kernel.
with urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as ca_url:
    ca = BeautifulSoup(ca_url, 'html.parser')

# Every <table> element found on the page.
ca_tables = ca.find_all('table')

### Creating lists to hold scraped data

In [92]:
# Three independent accumulators, one per scraped column.
postcodes, boroughs, nhs = [], [], []

### Iterating over rows in the Wikipedia tables, placing data in lists

In [93]:
# Walk every <tr> of every table on the page and collect the text of its
# first, second and last <td> cell into the three column lists.
for tbl in ca_tables:
    for tr in tbl.find_all('tr'):
        tds = tr.find_all('td')

        # Rows with fewer than two <td> cells carry no usable record
        # (presumably header rows); skip them.
        if len(tds) <= 1:
            continue

        postcodes.append(tds[0].text.strip())
        boroughs.append(tds[1].text.strip())
        nhs.append(tds[-1].text.strip())

# Start the dataframe from the postcode list; the other columns are added afterwards.
df3 = pd.DataFrame(postcodes, columns=['Postcodes'])

### Adding list data from Boroughs and Neighborhoods to df3 dataframe

In [94]:
# Attach the two remaining scraped lists as columns next to 'Postcodes'.
df3 = df3.assign(Boroughs=boroughs, Neighborhoods=nhs)

### Removing extraneous data from other tables

In [95]:
# Keep only the first 287 scraped rows; the rows after that come from other
# tables on the page.
# NOTE(review): 287 is a hard-coded row count - confirm it still matches the
# page layout before re-running.
# .copy() makes df4 an independent frame, so modifying it later does not raise
# SettingWithCopyWarning against df3.
df4 = df3.iloc[0:287].copy()

In [96]:
df4

Unnamed: 0,Postcodes,Boroughs,Neighborhoods
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


### Removing and cleaning duplicate data 

In [97]:
# 'dropBoNh' holds the index labels of rows whose borough AND neighborhood
# are both 'Not assigned'.
dropBoNh = df4[(df4['Boroughs'] == 'Not assigned') & (df4['Neighborhoods'] == 'Not assigned')].index

# Rebind the name instead of dropping in place: an in-place drop on a frame
# that was sliced out of another one raises SettingWithCopyWarning, and
# returning a new frame keeps this cell safe to re-run.
df4 = df4.drop(index=dropBoNh)
df4

Unnamed: 0,Postcodes,Boroughs,Neighborhoods
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


In [98]:
# Double-checking that no rows with an unassigned borough remain (expects an empty frame).
df4[df4['Boroughs'] == 'Not assigned']


Unnamed: 0,Postcodes,Boroughs,Neighborhoods


In [99]:
# Double-checking that no row has 'Not assigned' in the Postcodes column (expects an empty frame).
df4[df4['Postcodes'] == 'Not assigned']

Unnamed: 0,Postcodes,Boroughs,Neighborhoods


In [168]:
# Inspecting one postcode (M9V) to see the separate neighborhood rows that share an identical borough and postcode.
df4[df4['Postcodes'] == 'M9V']

Unnamed: 0,Postcodes,Boroughs,Neighborhoods
227,M9V,Etobicoke,Albion Gardens
228,M9V,Etobicoke,Beaumond Heights
229,M9V,Etobicoke,Humbergate
230,M9V,Etobicoke,Jamestown
231,M9V,Etobicoke,Mount Olive
232,M9V,Etobicoke,Silverstone
233,M9V,Etobicoke,South Steeles
234,M9V,Etobicoke,Thistletown


In [101]:
# Collapse the frame to one row per postcode. Within each group the borough
# and neighborhood strings are concatenated with commas; passing the join
# callables directly (not wrapped in lists) yields flat column names, so the
# result already has the columns Postcodes, Boroughs, Neighborhoods.
comma_join = ','.join
df5 = (
    df4
    .groupby('Postcodes', as_index=False)
    .agg({'Boroughs': comma_join, 'Neighborhoods': comma_join})
)
df5

Unnamed: 0,Postcodes,Boroughs,Neighborhoods
0,M1B,"Scarborough,Scarborough","Rouge,Malvern"
1,M1C,"Scarborough,Scarborough,Scarborough","Highland Creek,Rouge Hill,Port Union"
2,M1E,"Scarborough,Scarborough,Scarborough","Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,"Etobicoke,Etobicoke,Etobicoke,Etobicoke","Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,"Etobicoke,Etobicoke,Etobicoke,Etobicoke,Etobic...","Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [102]:
def _unique_in_order(joined):
    """Reduce a comma-joined string to its distinct, non-empty parts.

    Parts keep the order in which they first appear, so the result is
    deterministic (a set would not guarantee any order). Empty parts are
    dropped, which also removes leading, trailing and doubled commas.
    """
    return ','.join(dict.fromkeys(part for part in joined.split(',') if part))


# Removing the duplicate values in the Boroughs column.
df5['Boroughs'] = df5['Boroughs'].map(_unique_in_order)
df5

Unnamed: 0,Postcodes,Boroughs,Neighborhoods
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [103]:
# Spot-checking one borough to verify the Boroughs column is accurate and free of duplicate values.
df5[df5['Boroughs'] == 'Downtown Toronto']

Unnamed: 0,Postcodes,Boroughs,Neighborhoods
50,M4W,Downtown Toronto,Rosedale
51,M4X,Downtown Toronto,"Cabbagetown,St. James Town"
52,M4Y,Downtown Toronto,Church and Wellesley
53,M5A,Downtown Toronto,Harbourfront
54,M5B,Downtown Toronto,"Ryerson,Garden District"
55,M5C,Downtown Toronto,St. James Town
56,M5E,Downtown Toronto,Berczy Park
57,M5G,Downtown Toronto,Central Bay Street
58,M5H,Downtown Toronto,"Adelaide,King,Richmond"
59,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station"


In [104]:
df5.shape

(103, 3)

In [105]:
# Geocoder smoke test: look up one known borough/postcode pair and show its
# longitude. `geocoder` is already imported in the imports cell at the top of
# the notebook, so it is not re-imported here.
g = geocoder.arcgis('Downtown Toronto, M5G')
g.lng

-79.38492999999994

In [106]:
# Upper bound on lookups per postcode, so that a single failed response does
# not immediately leave a gap in the coordinates.
MAX_GEOCODE_ATTEMPTS = 3


def geocode_postcode(borough, postcode, attempts=MAX_GEOCODE_ATTEMPTS):
    """Look up the coordinates of a borough/postcode pair with the ArcGIS geocoder.

    Parameters
    ----------
    borough, postcode : values combined into the query string "<borough>, <postcode>".
    attempts : maximum number of lookups to try before giving up.

    Returns
    -------
    (lat, lng) as reported by the geocoder, or (None, None) when no attempt
    produced both values.
    """
    query = f'{borough}, {postcode}'
    for _ in range(attempts):
        g = geocoder.arcgis(query)
        if g.lat is not None and g.lng is not None:
            return g.lat, g.lng
    return None, None


# One lookup per row of df5, reading only the two columns the query needs.
coordinates = [
    geocode_postcode(borough, postcode)
    for borough, postcode in zip(df5['Boroughs'], df5['Postcodes'])
]

# Split the (lat, lng) pairs into the two lists that become the new columns.
list_lat = [lat for lat, _ in coordinates]
list_lng = [lng for _, lng in coordinates]

# Attach the coordinates to the dataframe as new columns.
df5['Latitude'] = list_lat
df5['Longitude'] = list_lng


In [107]:
df5

Unnamed: 0,Postcodes,Boroughs,Neighborhoods,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.217590
4,M1H,Scarborough,Cedarbrae,43.769688,-79.239440
...,...,...,...,...,...
98,M9N,York,Weston,43.704845,-79.517546
99,M9P,Etobicoke,Westmount,43.696505,-79.530252
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.686810,-79.557284
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.743145,-79.584664


In [108]:
df5[df5['Boroughs'] == 'East Toronto']

Unnamed: 0,Postcodes,Boroughs,Neighborhoods,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.687863,-79.27216
41,M4K,East Toronto,"The Danforth West,Riverdale",43.683178,-79.355105
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.667965,-79.314667
43,M4M,East Toronto,Studio District,43.65903,-79.34901
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.64869,-79.38544


In [221]:
# Base map. NOTE(review): the hard-coded centre matches the coordinates shown
# for the first postcode (M1B) in the table above - confirm this is the
# intended initial view.
ca_map = folium.Map([43.8115, -79.1955], zoom_start=10)

# Rows without coordinates (failed geocoding) cannot be drawn; folium refuses
# NaN locations, so leave them out rather than abort the whole map.
plottable = df5.dropna(subset=['Latitude', 'Longitude'])

# One circle marker per postcode, using the public folium.CircleMarker entry
# point rather than reaching into the folium.features submodule.
for lat, lng in zip(plottable['Latitude'], plottable['Longitude']):
    folium.CircleMarker(location=[lat, lng], radius=5).add_to(ca_map)

ca_map