# Notebook for scraping the Toronto Neighbourhoods Wikipedia page, cleaning the data and adding coordinates.

In [116]:
from io import StringIO

import geocoder
import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Getting the Wikipedia page as text, reading its tables into a variable and then saving the table of interest to a CSV file.

In [117]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status() stops the
# cell on an HTTP error instead of handing an error page to the HTML parser.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
url = response.text  # NOTE: despite its name, `url` holds the page's HTML text, not the address

# Parse every <table class="wikitable"> on the page; the first column of each table becomes its index.
# The HTML is wrapped in StringIO because passing a literal HTML string to read_html is deprecated.
wikitable = pd.read_html(StringIO(url), index_col=0, attrs={"class":"wikitable"})

# Persist the first matching table so the following cells can work from the CSV file.
wikitable[0].to_csv('Toronto_Neighbourhoods.csv')

#### Reading the CSV file, setting the columns to the right names and dropping the first row, which contained the column names.

In [118]:
# Load the saved table; its first data row carries the real column titles.
df = pd.read_csv('Toronto_Neighbourhoods.csv')
# Promote that row to the column labels ...
df.columns = df.iloc[0]
# ... and then remove it (index label 0) from the data itself.
df = df.drop(index=0)
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
285,M8Z,Etobicoke,Mimico NW
286,M8Z,Etobicoke,The Queensway West
287,M8Z,Etobicoke,Royal York South West
288,M8Z,Etobicoke,South of Bloor
289,M9Z,Not assigned,Not assigned


In [119]:
# Display the column labels to check the header assignment made in the previous cell.
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object', name=0)

#### After fixing the column names, replace the "Not assigned" Borough values with NaN and then drop those rows.

In [120]:
# Columns that must all be populated for a row to be kept.
columns = ['Postcode', 'Borough', 'Neighbourhood']
# Turn the "Not assigned" placeholder in Borough into NaN so that dropna() removes those rows,
# together with any row that is missing a value in one of `columns`.
# float('nan') is used because the `pd.np` alias was deprecated in pandas 1.0 and removed in 2.0.
df = (
    df.replace({'Borough': {'Not assigned': float('nan')}})
      .dropna(axis=0, how='any', subset=columns)
)

In [121]:
# Preview the first ten rows that remain after the rows with an unassigned Borough were dropped.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


#### Now replace each "Not assigned" Neighbourhood value with the respective Borough value (as in Queen's Park).

In [122]:
# Rows whose Neighbourhood is only the "Not assigned" placeholder.
unassigned = df['Neighbourhood'] == 'Not assigned'
# Fall back to the row's own Borough for those rows. mask() aligns the two columns by index,
# and assign() returns a new frame, so this avoids the chained in-place replace on
# `df.Neighbourhood`, which warns in pandas 2.x and has no effect under copy-on-write.
df = df.assign(Neighbourhood=df['Neighbourhood'].mask(unassigned, df['Borough']))
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


#### Aggregating the Neighbourhoods

In [123]:
# Collapse the rows that share a (Postcode, Borough) pair into one row whose remaining
# column values are joined with ", "; sort=False keeps the groups in order of first appearance.
group_keys = ['Postcode', 'Borough']
df = df.groupby(by=group_keys, sort=False).aggregate(', '.join)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Harbourfront, Regent Park"
M6A,North York,"Lawrence Heights, Lawrence Manor"
M7A,Queen's Park,Queen's Park


#### Resetting the index numbers

In [124]:
# Turn the current index back into ordinary columns (drop=False keeps its values)
# and give the frame a fresh 0..n-1 integer index.
df = df.reset_index(drop=False)
df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [125]:
# Show the last five rows of the aggregated table.
df.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."
102,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."


In [126]:
# (rows, columns) of the table at this point.
df.shape

(103, 3)

# Adding coordinates to the dataframe

In [127]:
# Load the local CSV of coordinates; the preview below shows its
# 'Postal Code', 'Latitude' and 'Longitude' columns.
coords = pd.read_csv('Geospatial_Coordinates.csv')
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [128]:
# Rename the key column to 'Postcode' so it can be used as the join key in the merge below.
# Only the column label is changed: the former `index=str` argument also converted the
# row index labels to strings, which nothing here relies on, and reassigning instead of
# using inplace=True leaves the previously displayed frame untouched.
coords = coords.rename(columns={'Postal Code': 'Postcode'})
coords.head(2)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497


In [129]:
# Attach the latitude/longitude to every postcode row of `df`.
# A left join keeps exactly the rows of `df` in their existing order; an outer join would
# also add rows for postcodes that appear only in `coords` (with empty Borough/Neighbourhood)
# and may reorder the result by key.
# validate='many_to_one' raises a MergeError if `coords` lists a postcode more than once,
# which would otherwise silently duplicate rows.
df_with_coords = pd.merge(df, coords, how='left', on='Postcode', validate='many_to_one')
df_with_coords.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
