In [1]:
import pandas as pd

In [2]:
# !conda install -c conda-forge BeautifulSoup4 --yes # 
from bs4 import BeautifulSoup # import BeautifulSoup library

In [3]:
# install a parser
# !conda install -c conda-forge html5lib --yes # 

In [4]:
# install request library
# !conda install -c conda-forge request --yes
import requests

## Scrape Neighborhood Information

In [5]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text # Get the source html code
# add on .text to get the source code from response objec

In [6]:
soup = BeautifulSoup(source, 'lxml')

In [7]:
# Inspect the soup object and identify the "table" class where the desired information is located.
table = soup.find('table')

In [8]:
rowsList = [] # append rows of table
for rows in table.find_all('tr'):
    rowsList.append(rows.text)

In [9]:
pstcd = []
Borough = []
Neighborhood = []
Neighborhood_data = {'postcode':[], 'Borough':[], 'Neighborhood':[]} # neighborhood data
# rowslist includes '\n'. 
# Split using '\n' and extract the postcode, Borough and Neighborhood text for every row.
for row in rowsList:
    pstcd.append(row.split('\n')[1])
    Borough.append(row.split('\n')[2])
    Neighborhood.append(row.split('\n')[3])

# Fill dictionary    
Neighborhood_data['postcode'] = pstcd[1:]
Neighborhood_data['Borough'] = Borough[1:]
Neighborhood_data['Neighborhood'] = Neighborhood[1:]

# Make dataframe of neighborhood data using dictionary
Neighbors = pd.DataFrame(data = Neighborhood_data)

In [10]:
Neighbors.head(5)

Unnamed: 0,postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [14]:
# Change column name
Neighbors.columns=['PostalCode', 'Borough', 'Neighborhood']
Neighbors_orig = Neighbors
Neighbors.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Drop the rows where Borough is "Not assigned"

In [15]:
# find index of rows where Borough is 'Not assigned'
drop_index = (Neighbors[Neighbors['Borough'].str.match('Not assigned')]).index 

# drop the rows containing 'Not assigned' in 'Borough' column
Neighbors.drop(drop_index, inplace = True)
Neighbors.reset_index(inplace = True, drop=True)


### Combine neighborhoods that share the same PostalCode

In [33]:
Neighbors = Neighbors.groupby('PostalCode').agg({'Borough':'first',
                                        'Neighborhood': ', '.join}).reset_index()

In [34]:
Neighbors.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Assign Borough name to Neighborhood name when the latter is "Not assigned"

In [42]:
# Check the cells that have a borough, but a Not assigned neighborhood. 
#Assign the Borough name to Neighborhood.

# find index of the rows where Neighborhood is "Not assigned"
tempInd = (Neighbors[Neighbors['Neighborhood'].str.match('Not assigned')]).index

# assign the Borough name to Neighborhood name
Neighbors.iloc[tempInd, 2] = Neighbors.iloc[tempInd, 1]

### Number of rows:

In [49]:
Neighbors.shape

(103, 3)

### Download geographical coordinates
Ideally Google API (not free) of geocoder (free, but not consistent, multiple requests are needed sometimes) should be used for this. But for now we use the already downloaded coordinates in csv format from http://cocl.us/Geospatial_data

In [54]:
# Download geographical coordinates:
!wget -O geo_coord_Toronto.csv  http://cocl.us/Geospatial_data
print('Download compelete!')
coor_toronto = pd.read_csv('geo_coord_Toronto.csv')




--2019-04-04 11:39:18--  http://cocl.us/Geospatial_data
Resolving cocl.us... 169.48.113.201
Connecting to cocl.us|169.48.113.201|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2019-04-04 11:39:18--  https://cocl.us/Geospatial_data
Connecting to cocl.us|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-04-04 11:39:21--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com... 107.152.26.197
Connecting to ibm.box.com|107.152.26.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-04-04 11:39:21--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.co

In [67]:
print(coor_toronto.shape)
coor_toronto.columns = ['PostalCode', 'Latitude', 'Longitude']
coor_toronto.head(5)

(103, 3)


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Join Neighborhood and coordinate tables

In [68]:
# Keep a copy of Neighbors before joining with coordinate table
x = Neighbors.copy()

In [80]:
coor_toronto.dtypes

PostalCode     object
Latitude      float64
Longitude     float64
dtype: object

In [81]:
Neighbors.dtypes

PostalCode      object
Borough         object
Neighborhood    object
dtype: object

In [83]:
new = Neighbors.merge(coor_toronto, on='PostalCode')
new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
