Web Scraping with BeautifulSoup

1. Import necessary libraries

In [7]:
import pandas as pd
import numpy as np
import requests as rq
from bs4 import BeautifulSoup

2. Scrape data from Wikipedia page and save under source
3. Parse data using BeautifulSoup
4. Use tags to find data between table and table row tags and save it

In [8]:
# Download the Wikipedia page and keep its HTML text in `source`.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception instead of letting
# an error page be parsed as if it were the real content.
response = rq.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'html.parser')
# find the first <table> element in the document
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear message
    raise ValueError('No <table> element found in the downloaded page.')
# identify all rows (<tr>) in the table
table_rows = table.find_all('tr')


5. Create new table based on data saved for each table row
6. Clean data, drop first row etc. (see explanation for each line of code)
7. Print dataframe

In [9]:
# Turn every table row into a list of cleaned cell texts.
new_table = []
for tr in table_rows:
    # find all table data (<td>) cells inside the table row (<tr>);
    # strip surrounding whitespace (including the trailing "\n") from each cell
    cells = [cell.text.strip() for cell in tr.find_all('td')]
    # rows without any <td> cells (e.g. a header row) carry no data, so skip them
    if cells:
        new_table.append(cells)

df = pd.DataFrame(new_table, columns=["Postcode", "Borough", "Neighbourhood"])
# drop incomplete rows (any missing cell)
df = df.dropna(axis=0)
# delete rows where column Borough has value 'Not assigned'
df = df[df['Borough'] != 'Not assigned'].copy()
# replace 'Not assigned' neighbourhoods with the borough name; this is done
# before grouping so the check still works when a postcode has several rows
df['Neighbourhood'] = np.where(df['Neighbourhood'] == 'Not assigned', df['Borough'], df['Neighbourhood'])
# group rows with the same Postcode and concatenate their neighbourhoods
df = df.groupby('Postcode').agg({'Borough': 'first', 'Neighbourhood': ', '.join}).reset_index()
# show a preview instead of dumping the whole frame
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


8. Import csv file

In [10]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


9. Add two columns 'Latitude' and 'Longitude' and combine two dataframes 

In [11]:
# Attach Latitude and Longitude to each postcode by joining on the postal code.
# A key-based merge does not depend on both frames having the same length and
# row order, unlike an element-wise comparison of the two code columns.
df = (
    # drop coordinate columns left over from a previous run so the cell can be re-run
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
      .merge(
          df_coord[['Postal Code', 'Latitude', 'Longitude']],
          how='left',                # keep every postcode, even without coordinates
          left_on='Postcode',
          right_on='Postal Code',
          validate='many_to_one',    # raise if df_coord lists a postal code twice
      )
      # the join key from df_coord duplicates 'Postcode'
      .drop(columns='Postal Code')
)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395
5,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7279,-79.262
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.7111,-79.2846
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.7163,-79.2395
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.6927,-79.2648


10. Import required libraries and packages

In [12]:
import random # library for random number generation

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

from IPython.display import display_html
    
# library for transforming a JSON file into a pandas DataFrame
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    altair-3.3.0               |           py36_0         747 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

11. Filter Boroughs that contain Toronto
12. Create map and show the filtered result

In [15]:
# get all rows whose Borough contains the literal text 'Toronto';
# na=False treats missing boroughs as non-matching, and .copy() gives an
# independent frame so later column additions do not touch a slice of df
df_tor = df[df['Borough'].str.contains('Toronto', regex=False, na=False)].copy()

# create map around a fixed centre point
map_tor = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# add one circle marker per row, labelled "<neighbourhood>, <borough>"
for lat, lng, borough, neighbourhood in zip(df_tor['Latitude'], df_tor['Longitude'], df_tor['Borough'], df_tor['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    # parse_html=True escapes the label text inside the popup
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
    ).add_to(map_tor)
map_tor

13. Use k-means for clustering the filtered boroughs
14. Create a map of clustered results

In [23]:
# number of clusters
k = 4

# cluster on the coordinates only; selecting the columns explicitly keeps the
# feature matrix stable even if df_tor gains extra columns on a re-run
tor_clusters = df_tor[['Latitude', 'Longitude']].astype(float)
# n_init is set explicitly so the result does not depend on the library default
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(tor_clusters)

# store the labels under the same column name the plotting loop reads;
# dropping an existing column first lets the cell be re-run without an error
df_tor = df_tor.drop(columns='Cluster Labels', errors='ignore')
df_tor.insert(0, 'Cluster Labels', kmeans.labels_)

#create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

#set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#add markers to the map
for lat, lon, neighbourhood, cluster in zip(df_tor['Latitude'], df_tor['Longitude'], df_tor['Neighbourhood'], df_tor['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to k-1, so they index the colour list directly
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.5).add_to(map_clusters)

map_clusters