# First Let's import required libraries

In [155]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
from pandas.io.json import json_normalize  # transform json files to pandas dataframes
from geopy.geocoders import Nominatim # 
import numpy as np
import csv
!pip install folium
import folium

print('All modules imported')

All modules imported


# Let's start scraping the wikipedia page

In [156]:
# The wikipedia site link
site_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

Get the source code html data from the website

In [157]:
source = requests.get(site_link).text

Lets Use BeautifulSoup to parse it

In [158]:
soup = BeautifulSoup(source, 'lxml')

#print(soup.prettify())

Next let's get the table that contains the data we want to scrape

In [159]:
My_table = soup.find('table',{'class':'wikitable sortable'})

Let's view the table data

In [160]:
# Uncomment below to view table
# My_table

we can see that all the data we want are between the $&lt;td&gt;$ brackets, let's get the data between the td brackets

In [161]:
links = My_table.find_all('td')

In [162]:
# uncomment below to view links
# print(links)

Next let's loop through links and extract only the text elements

In [163]:
text_links = []

for link in links:
    text_links.append(link.text)
    
# uncommnet below to view text_links    
#text_links

let's clean and process the table elements
Let's clean the links and keep only rows with Borough. Out of which we shall rename rows without Neighborhood as Boroughs

In [164]:
cleaned_links = []

while True:
    
    if len(text_links) < 3:
        break
    
    sub = text_links[:3]
    # If 'Not ' in borough then skip that row of data
    if 'Not ' in sub[1]:
        text_links = text_links[3:]
    else:
        cleaned_links.append(text_links[:3])
        
        # Let's strip off the \n at the end of each neighborhood data
        cleaned_links[-1][-1] = cleaned_links[-1][-1].strip('\n')
        
        # If the Borough is available but the Neighborhood is missing
        # make Neighborhood same as Borough
        if 'Not ' in cleaned_links[-1][-1]:
            cleaned_links[-1][-1] = cleaned_links[-1][-2]
        text_links = text_links[3:]
# Uncomment below to view cleaned_links       
#cleaned_links

lets check the length of the cleaned links

In [165]:
len(cleaned_links)

103

Next let's add the neighborhood data of each duplicate Postal Codes together to the first instance or row that contains the PostalCode

In [166]:
link = []
for i in range(len(cleaned_links)):
    x = cleaned_links[i][0]
    if x in link:
        cleaned_links[link.index(x)][-1] += ', ' + cleaned_links[i][-1]
    link.append(x)
    
# uncomment below
#cleaned_links

Next let's pass the cleaned _links to a data frame and set index to postal code so that we can easily work on it

In [167]:
df = pd.DataFrame(cleaned_links, columns=['PostalCode','Borough','Neighborhood'])
df.index= df.PostalCode

In [168]:
# Let's view the data frame
df.head()

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M3A\n,M3A\n,North York\n,Parkwoods
M4A\n,M4A\n,North York\n,Victoria Village
M5A\n,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"
M6A\n,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights"
M7A\n,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government"


Next let's Use the pandas duplicate method to drop duplicate index

In [169]:
df = df.loc[~df.index.duplicated(keep='first')]

In [170]:
# Let's see the shape so far
df.shape

(103, 3)

In [171]:
df.head()

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M3A\n,M3A\n,North York\n,Parkwoods
M4A\n,M4A\n,North York\n,Victoria Village
M5A\n,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"
M6A\n,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights"
M7A\n,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government"


Next let's reset the index back and drop the current index

In [173]:
df.reset_index(drop=True, inplace=True)

# Let's see the first few rows
df.head(13)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods
1,M4A\n,North York\n,Victoria Village
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government"
5,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village"
6,M1B\n,Scarborough\n,"Malvern, Rouge"
7,M3B\n,North York\n,Don Mills
8,M4B\n,East York\n,"Parkview Hill, Woodbine Gardens"
9,M5B\n,Downtown Toronto\n,"Garden District, Ryerson"


In [174]:
df.shape

(103, 3)

Appending the Latitude and Longitude data
Let's define a simple method that we can apply to each Borough to get its Latitude and Longitude using the apply() method

In [175]:
def latitude_longitude(Borough):
    """ Method takes a Series object and returns
    a list of Latitude and corresponding Longitude data,
    using the geopy library.
    This method also prints out the coordinate data"""
    
    address = Borough
    
    geolocator = Nominatim(user_agent="CA_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
    return [latitude, longitude]

Let's append the list containing corresponding lat and lon data to column Latitude

In [176]:
df['Latitude'] = df.Borough.apply(latitude_longitude)

The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of Downtown Toronto
 are 43.6541737, -79.38081164513409.
The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of Downtown Toronto
 are 43.6541737, -79.38081164513409.
The geograpical coordinate of Etobicoke
 are 43.6435559, -79.5656326.
The geograpical coordinate of Scarborough
 are 54.2820009, -0.4011868.
The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of East York
 are 43.699971000000005, -79.33251996261595.
The geograpical coordinate of Downtown Toronto
 are 43.6541737, -79.38081164513409.
The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of Etobicoke
 are 43.6435559, -79.5656326.
The geograpical coordinate of Scarborough
 are 54.2820009, -0.

In [177]:
# Lets see the updated data with Latitude containing lists of lats and lons data

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude
0,M3A\n,North York\n,Parkwoods,"[43.7543263, -79.44911696639593]"
1,M4A\n,North York\n,Victoria Village,"[43.7543263, -79.44911696639593]"
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront","[43.6541737, -79.38081164513409]"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights","[43.7543263, -79.44911696639593]"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government","[43.6541737, -79.38081164513409]"


Next let's loop through the data frame and separate Latitude from Longitude and make lat and lons just numbers not lists

In [178]:
lon_list = []
for i, j in df.iterrows():
    lon_list.append(j.Latitude[1])
    j.Latitude = j.Latitude[0]
    
# next let's assign the lon_list as the value of the Longitude Column

df['Longitude'] = lon_list

In [179]:
# let's view the changes

df.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A\n,North York\n,Parkwoods,43.7543,-79.449117
1,M4A\n,North York\n,Victoria Village,43.7543,-79.449117
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront",43.6542,-79.380812
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights",43.7543,-79.449117
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government",43.6542,-79.380812
...,...,...,...,...,...
95,M1X\n,Scarborough\n,Upper Rouge,54.282,-0.401187
96,M4X\n,Downtown Toronto\n,"St. James Town, Cabbagetown",43.6542,-79.380812
97,M5X\n,Downtown Toronto\n,"First Canadian Place, Underground city",43.6542,-79.380812
98,M8X\n,Etobicoke\n,"The Kingsway, Montgomery Road, Old Mill North",43.6436,-79.565633


In [180]:
df.shape

(103, 5)