In [100]:
import io

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Below we fetch the page's HTML with requests and keep only the markup of the first table body, discarding everything around it.
This assumes that the table we want is the first table on the page.

In [101]:
# Download the Wikipedia "List of postal codes of Canada: M" page.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops us from silently parsing an HTTP error page as if it were the article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# Round-trip the markup through BeautifulSoup and back to text; everything
# below works on the serialized HTML string, not on the parsed tree.
soup = str(BeautifulSoup(source, 'lxml'))

# Keep only the text between the first '<tbody' and the first '</tbody>'.
start = soup.find('<tbody')
end = soup.find('</tbody>')
if start == -1 or end == -1:
    # str.find() returns -1 on a miss, which would otherwise slice out garbage.
    raise ValueError('No <tbody>...</tbody> section found in the downloaded page')
table = soup[start:end]

We split the table text into rows, split each row into cells, and load the result into a DataFrame.
This assumes that the table has a header row, that each table row starts with a bare <tr> tag, and that each cell in a row starts with a bare <td> tag.

In [102]:
# Work on a separate name instead of rebinding `table`, so re-running this cell
# always starts from the same input and produces the same DataFrame.
# Skip the header row: jump to the first '</th>', then past the next '</tr>'.
rows_html = table[table.find('</th>'):]
# '</tr>' is five characters long, so +4 leaves its final '>' behind; that
# stray character lands in the first fragment below (row 0 of the DataFrame).
rows_html = rows_html[rows_html.find('</tr>') + 4:]

# One text fragment per table row, then one list of cell fragments per row.
append = rows_html.split('<tr>')
dataframe = [row.split('<td>') for row in append]

df = pd.DataFrame(dataframe)

We need to clean up the DataFrame a little: name the columns, drop the junk first row and first column, strip the leftover HTML surrounding each value, and remove the rows whose borough is 'Not assigned'.

In [103]:
def _strip_cell_markup(cell, leftover_tokens):
    """Return the text of one scraped table cell with its HTML leftovers removed.

    Parameters
    ----------
    cell : str
        Raw cell fragment as produced by splitting the table text on '<td>'.
    leftover_tokens : iterable of str
        Substrings to delete from the fragment, applied in the given order.

    Returns
    -------
    str
        The fragment without the given tokens. If an opening '<a' tag is
        still present, everything up to and including the first '>' is
        dropped as well.
    """
    for token in leftover_tokens:
        cell = cell.replace(token, '')
    if '<a' in cell:
        cell = cell[cell.find('>') + 1:]
    return cell


# Name the data columns and discard the junk first column and first row.
# errors='ignore' makes re-running the cell a no-op instead of a KeyError,
# because the labels are already gone after the first run.
df = (
    df.rename(columns={1: 'PostalCode', 2: 'Borough', 3: 'Neighborhood'})
      .drop(columns=0, errors='ignore')
      .drop(index=0, errors='ignore')
)

# Keep only the first three characters of the postal-code cell.
df['PostalCode'] = df['PostalCode'].str[:3]

# Clean whole columns with Series.map rather than assigning to the rows yielded
# by iterrows(): pandas documents that those rows may be copies, so writes to
# them are not guaranteed to reach df.
df['Borough'] = df['Borough'].map(
    lambda cell: _strip_cell_markup(cell, ('</a>', '</td>', '\n'))
)
df['Neighborhood'] = df['Neighborhood'].map(
    lambda cell: _strip_cell_markup(cell, ('</td></tr>', '</a>', '\n'))
)

# Remove rows whose borough is 'Not assigned'.
df = df.loc[df['Borough'] != 'Not assigned']
# Display only: the result is not assigned, so df keeps its original row labels.
df.reset_index()

Unnamed: 0,index,PostalCode,Borough,Neighborhood
0,3,M3A,North York,Parkwoods
1,4,M4A,North York,Victoria Village
2,5,M5A,Downtown Toronto,Harbourfront
3,6,M5A,Downtown Toronto,Regent Park
4,7,M6A,North York,Lawrence Heights
5,8,M6A,North York,Lawrence Manor
6,9,M7A,Queen's Park,Not assigned
7,11,M9A,Etobicoke,Islington Avenue
8,12,M1B,Scarborough,Rouge
9,13,M1B,Scarborough,Malvern


To consolidate duplicates, we'll group by postal code and borough, and join the neighborhood names of each group into a single comma-separated string.

In [104]:
# Collapse the frame to one row per (PostalCode, Borough) pair, joining the
# neighborhood names of each pair into a single comma-separated string.
df_grouped = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
      .agg(lambda names: ', '.join(names))
      .reset_index()
)
print(df_grouped.shape)
df_grouped[:50]

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [105]:
# Download the CSV holding a latitude and longitude for each postal code.
# The timeout keeps the cell from hanging, and raise_for_status() stops us
# from parsing an HTTP error page as if it were the data.
response = requests.get('http://cocl.us/Geospatial_data', timeout=30)
response.raise_for_status()
lat_lng_csv = response.text

# Let pandas parse the CSV instead of splitting on '\r' and trimming the
# leftover '\n' from every code by hand. header=0 together with `names`
# replaces the file's own header row with our column names, and dtype=str
# keeps every value as the exact text found in the file.
postal_codes = pd.read_csv(
    io.StringIO(lat_lng_csv),
    header=0,
    names=['PostalCode', 'Latitude', 'Longitude'],
    dtype=str,
)

# Attach the coordinates to every grouped row. Any coordinate columns left by
# a previous run of this cell are dropped first, so re-running does not
# produce duplicated, suffixed columns.
df_grouped = (
    df_grouped
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(postal_codes, how='left', on='PostalCode')
)
df_grouped[:5]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8066863,-79.1943534
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761
5,M1J,Scarborough,Scarborough Village,43.7447342,-79.2394761
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7279292,-79.2620294
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.7111117,-79.2845772
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.2394761
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.2648481
