Importing libraries

In [47]:
import requests
from bs4 import BeautifulSoup
import csv
import json
import lxml
import pandas as pd

Scrape data from Wikipedia

In [28]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text


The tabular data is availabe in table and belongs to class="wikitable sortable"So let's extract only table

In [32]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(website_url,'html.parser')

In [34]:
neighborhood = soup.find('table', class_ = 'wikitable')
neighborhood_rows = neighborhood.find_all('tr')

Extract the columns 'Postcode', 'Borough', 'Neighbourhood' from the table

In [35]:
information = []
for row in neighborhood_rows:
    info = row.text.split('\n')[1:-1] # remove empty str (the first and last items)
    information.append(info)
    
information[0:20] #preview the first 20 rows

[['Postal Code', '', 'Borough', '', 'Neighborhood'],
 ['M1A', '', 'Not assigned', '', 'Not assigned'],
 ['M2A', '', 'Not assigned', '', 'Not assigned'],
 ['M3A', '', 'North York', '', 'Parkwoods'],
 ['M4A', '', 'North York', '', 'Victoria Village'],
 ['M5A', '', 'Downtown Toronto', '', 'Regent Park, Harbourfront'],
 ['M6A', '', 'North York', '', 'Lawrence Manor, Lawrence Heights'],
 ['M7A',
  '',
  'Downtown Toronto',
  '',
  "Queen's Park, Ontario Provincial Government"],
 ['M8A', '', 'Not assigned', '', 'Not assigned'],
 ['M9A', '', 'Etobicoke', '', 'Islington Avenue, Humber Valley Village'],
 ['M1B', '', 'Scarborough', '', 'Malvern, Rouge'],
 ['M2B', '', 'Not assigned', '', 'Not assigned'],
 ['M3B', '', 'North York', '', 'Don Mills'],
 ['M4B', '', 'East York', '', 'Parkview Hill, Woodbine Gardens'],
 ['M5B', '', 'Downtown Toronto', '', 'Garden District, Ryerson'],
 ['M6B', '', 'North York', '', 'Glencairn'],
 ['M7B', '', 'Not assigned', '', 'Not assigned'],
 ['M8B', '', 'Not assigne

Turn above information table into a Pandas dataframe


In [38]:
neighbor_df = pd.DataFrame(information[1:], columns=information[0])
# where information[1:] contains each row of neighborhoods
# and columns = information[0] gives the column names


# Re-spell Neighbourhood as Neighborhood
#neighbor_df = neighbor_df.rename(columns={neighbor_df.columns[2]: "Neighborhood" })


neighbor_df.head(20)

Unnamed: 0,Postal Code,Unnamed: 2,Borough,Unnamed: 4,Neighborhood
0,M1A,,Not assigned,,Not assigned
1,M2A,,Not assigned,,Not assigned
2,M3A,,North York,,Parkwoods
3,M4A,,North York,,Victoria Village
4,M5A,,Downtown Toronto,,"Regent Park, Harbourfront"
5,M6A,,North York,,"Lawrence Manor, Lawrence Heights"
6,M7A,,Downtown Toronto,,"Queen's Park, Ontario Provincial Government"
7,M8A,,Not assigned,,Not assigned
8,M9A,,Etobicoke,,"Islington Avenue, Humber Valley Village"
9,M1B,,Scarborough,,"Malvern, Rouge"


Cleaning acc to conditions

In [39]:
neighbor_df = neighbor_df[neighbor_df.Borough != 'Not assigned']

neighbor_df.reset_index(drop=True, inplace=True)

neighbor_df.head(20)


Unnamed: 0,Postal Code,Unnamed: 2,Borough,Unnamed: 4,Neighborhood
0,M3A,,North York,,Parkwoods
1,M4A,,North York,,Victoria Village
2,M5A,,Downtown Toronto,,"Regent Park, Harbourfront"
3,M6A,,North York,,"Lawrence Manor, Lawrence Heights"
4,M7A,,Downtown Toronto,,"Queen's Park, Ontario Provincial Government"
5,M9A,,Etobicoke,,"Islington Avenue, Humber Valley Village"
6,M1B,,Scarborough,,"Malvern, Rouge"
7,M3B,,North York,,Don Mills
8,M4B,,East York,,"Parkview Hill, Woodbine Gardens"
9,M5B,,Downtown Toronto,,"Garden District, Ryerson"


Combine neighborhoods with the same postcode


In [46]:
grouped = neighbor_df.groupby(['Postal Code']) # group by Postcode


# combine the neighborhoods grouped by postcode and into a new df
neighborhood_grouped = grouped['Neighborhood'].apply(lambda x: x.sum()) 
# adds spaces and commas between neighborhoods
neighborhood_grouped = grouped['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
# matches a borough to each postcode
borough_grouped = grouped['Borough'].apply(lambda x: set(x).pop())
# turn borough_grouped and neighborhood_grouped into dataframes
borough = borough_grouped.to_frame()
neighborhood = neighborhood_grouped.to_frame()
#combine the dataframe borough and the dataframe neighborhood into one dataframe
grouped_final = borough.merge(neighborhood, on="Postal Code")

grouped_final

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


Print the .shape method to print the number of rows of the dataframe


In [50]:
print('The number of rows and columns in this final grouped dataframe is',grouped_final.shape)

The number of rows and columns in this final grouped dataframe is (103, 2)
