Web Scraping with BeautifulSoup

1. Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import requests as rq
from bs4 import BeautifulSoup

2. Scrape data from Wikipedia page and save under source
3. Parse data using BeautifulSoup
4. Use tags to find data between table and table row tags and save it

In [2]:
# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops on an HTTP error instead of
# silently parsing an error page.
response = rq.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source,'html.parser')
# Take the first <table> element on the page.
# NOTE(review): this assumes the first table is the postal-code table - confirm
# against the current page layout.
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError('No <table> element found in the downloaded page')
# Collect every table row (<tr>) inside that table.
table_rows = table.find_all('tr')


5. Create new table based on data saved for each table row
6. Clean data, drop first row etc. (see explanation for each line of code)
7. Print dataframe

In [3]:
# Build one list of cell texts per table row.
new_table = []
for tr in table_rows:
    # Find all table data (<td>) cells between the table row (<tr>) tags.
    # A row without <td> cells (e.g. a header row) yields an empty list, which
    # becomes an all-missing row in the DataFrame and is dropped below.
    cells = tr.find_all('td')
    # Strip surrounding whitespace (including a trailing "\n") from every cell.
    # This replaces blindly cutting the last character of 'Neighbourhood',
    # which would truncate values that do not end in a newline.
    row = [cell.text.strip() for cell in cells]
    new_table.append(row)
df = pd.DataFrame(new_table, columns = ["Postcode", "Borough", "Neighbourhood"])
# Drop rows with missing values (the rows that had no <td> cells).
df = df.dropna(axis=0)
# Delete rows where column Borough has value 'Not assigned'; copy so the column
# assignment below works on an independent frame rather than a slice.
df = df[df.Borough != 'Not assigned'].copy()
# Replace 'Not assigned' in column 'Neighbourhood' with the value from column
# 'Borough'. This is done per row *before* grouping so that it also applies
# when a postcode has several rows (after joining, the text would no longer be
# exactly 'Not assigned').
df['Neighbourhood'] = np.where(df['Neighbourhood'] == 'Not assigned', df['Borough'], df['Neighbourhood'])
# Group rows with the same 'Postcode': keep the smallest Borough value and
# concatenate the Neighbourhood values with ', '.
df = df.groupby('Postcode').agg({'Borough':'min','Neighbourhood':', '.join}).reset_index()
# Add empty placeholder columns for Latitude and Longitude.
df['Latitude'] = None
df['Longitude'] = None

8. Import csv file

In [4]:
# The code was removed by Watson Studio for sharing.
# NOTE(review): a later cell reads `df_coord`, which is not defined anywhere in
# the visible notebook - presumably it was loaded from the CSV file in this
# cell. Judging by the output below, it has the columns 'Postal Code',
# 'Latitude' and 'Longitude'. The notebook cannot be re-run from a fresh kernel
# until this load is restored.
# NOTE(review): the execution counts jump from In [4] to In [6]; confirm that no
# required cell was deleted.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


9. Combine two dataframes 

In [6]:
#insert respective latitude and longitude based on postcode
df['Latitude'] = np.where(df['Postcode'] == df_coord['Postal Code'], df_coord['Latitude'], df['Latitude'])
df['Longitude'] = np.where(df['Postcode'] == df_coord['Postal Code'], df_coord['Longitude'], df['Longitude'])
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395
5,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7279,-79.262
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.7111,-79.2846
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.7163,-79.2395
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.6927,-79.2648
