# Code to scrape the Wikipedia page of the Neighborhoods of Toronto

First, I import the required libraries.

In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Passing None removes the display limit, so DataFrames are shown without
# truncating columns or rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files


from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                       

## Getting the HTML code of the Wikipedia page

Using the requests library, I store the HTML code of the Toronto postal code Wikipedia page as text in 'source', and then create a BeautifulSoup object, 'soup', to extract the page's contents such as tables, titles, etc.

In [14]:
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists the postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error response (4xx/5xx) into an
# exception instead of silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
#print(source)

# Parse the HTML with the lxml parser so elements can be looked up by tag.
soup  = BeautifulSoup(source,'lxml')
#print(soup.prettify()) # using prettify method to show the code in indented format.

In [12]:
# Quick sanity check that the page was parsed: show the text of its <title> tag
title = soup.title.get_text()
print(title)

List of postal codes of Canada: M - Wikipedia


In [16]:
# Grab the first <table> whose class attribute is "wikitable sortable".
tables = soup.find('table', class_ = "wikitable sortable")

# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
if tables is None:
    raise ValueError('No table with class "wikitable sortable" was found on the page')
#print(tables.prettify())

The table body is organized into table rows (`<tr>`), and each row is split into either header cells (`<th>`) or data cells (`<td>`).
I will first extract all rows and view them in text format. Using '.text' strips the HTML tags and returns only the text they contain.

In [17]:
# Show every <tr> of the table as plain text to eyeball the raw contents
for table_row in tables.find_all('tr'):
    print(table_row.get_text())


Postal code

Borough

Neighborhood


M1A

Not assigned




M2A

Not assigned




M3A

North York

Parkwoods


M4A

North York

Victoria Village


M5A

Downtown Toronto

Regent Park / Harbourfront


M6A

North York

Lawrence Manor / Lawrence Heights


M7A

Downtown Toronto

Queen's Park / Ontario Provincial Government


M8A

Not assigned




M9A

Etobicoke

Islington Avenue


M1B

Scarborough

Malvern / Rouge


M2B

Not assigned




M3B

North York

Don Mills


M4B

East York

Parkview Hill / Woodbine Gardens


M5B

Downtown Toronto

Garden District, Ryerson


M6B

North York

Glencairn


M7B

Not assigned




M8B

Not assigned




M9B

Etobicoke

West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale


M1C

Scarborough

Rouge Hill / Port Union / Highland Creek


M2C

Not assigned




M3C

North York

Don Mills


M4C

East York

Woodbine Heights


M5C

Downtown Toronto

St. James Town


M6C

York

Humewood-Cedarvale


M7C

Not assigned




M8C

Not assigned




M9C


### Listing the rows
The following code will make a list of all the elements of a row.....

In [18]:
# Print, for each table row, the list of texts of its <td> cells
for table_row in tables.find_all('tr'):
    cell_texts = [cell.text for cell in table_row.find_all('td')]
    print(cell_texts)

[]
['M1A\n', 'Not assigned\n', '\n']
['M2A\n', 'Not assigned\n', '\n']
['M3A\n', 'North York\n', 'Parkwoods\n']
['M4A\n', 'North York\n', 'Victoria Village\n']
['M5A\n', 'Downtown Toronto\n', 'Regent Park / Harbourfront\n']
['M6A\n', 'North York\n', 'Lawrence Manor / Lawrence Heights\n']
['M7A\n', 'Downtown Toronto\n', "Queen's Park / Ontario Provincial Government\n"]
['M8A\n', 'Not assigned\n', '\n']
['M9A\n', 'Etobicoke\n', 'Islington Avenue\n']
['M1B\n', 'Scarborough\n', 'Malvern / Rouge\n']
['M2B\n', 'Not assigned\n', '\n']
['M3B\n', 'North York\n', 'Don Mills\n']
['M4B\n', 'East York\n', 'Parkview Hill / Woodbine Gardens\n']
['M5B\n', 'Downtown Toronto\n', 'Garden District, Ryerson\n']
['M6B\n', 'North York\n', 'Glencairn\n']
['M7B\n', 'Not assigned\n', '\n']
['M8B\n', 'Not assigned\n', '\n']
['M9B\n', 'Etobicoke\n', 'West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale\n']
['M1C\n', 'Scarborough\n', 'Rouge Hill / Port Union / Highland Creek\n']
['M2C\n', 'No

### Creating the DataFrame
...but we need to make an array of the lists above so we can make a dataframe out of it. We do that below.

In [19]:
# One inner list per <tr>, holding the text of each of its <td> cells
row = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in tables.find_all('tr')
]

df = pd.DataFrame(data=row)
df.head()

Unnamed: 0,0,1,2
0,,,
1,M1A\n,Not assigned\n,\n
2,M2A\n,Not assigned\n,\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n


### Adding Header row
In the previous cell we only looped through the data cells (`<td>`) of each row, so the dataframe has no column names and its values still end in '\n'. Here we strip the values and use the header cells (`<th>`) of the HTML table as the column labels.

In [77]:
# Rebuild the row list from scratch. Appending to the list filled by the
# previous cell would keep its un-stripped rows (values ending in '\n') and
# add every table row a second time.
# The header <tr> contains no <td> cells, so it still yields an empty first
# row (index 0) in the resulting dataframe.
row = [
    [cell.text.strip() for cell in tr.find_all('td')]  # strip() removes the trailing '\n'
    for tr in tables.find_all('tr')
]

# Column labels come from the <th> cells; strip their trailing '\n' as well.
labels = [header.text.strip() for header in tables.find_all('th')]
#print(labels)

df = pd.DataFrame(data=row, columns=labels)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Cleaning the dataframe
Now we will start cleaning the dataframe. Let's start by removing the empty top row and then resetting the index.

In [78]:
# Remove the row labelled 0 (the empty row that came from the table's header <tr>)
df1 = df.drop(index=[0])
df1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront


In [79]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as a new column
df1 = df1.reset_index(drop=True)
df1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Removing all the 'Not assigned' values from the Borough column

In [80]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the remaining rows from 0
is_assigned = df1['Borough'].ne('Not assigned')
df1 = df1.loc[is_assigned].reset_index(drop=True)
df1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
