In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
import requests
from sklearn.cluster import KMeans

%matplotlib inline


In [2]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
url_wp = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
wpage = requests.get(url_wp, timeout=30)
# Fail here on an HTTP error instead of handing an error page to the parser.
wpage.raise_for_status()
soup = BeautifulSoup(wpage.content, "html.parser")

In [3]:
# Extract (postal code, borough, neighbourhood) triples from the first
# "wikitable" on the page into three parallel lists: postal, bor and nei.
table = soup.find("table", class_="wikitable")
if table is None:
    # Without this check a missing table surfaces as an opaque AttributeError.
    raise ValueError('no <table class="wikitable"> found in the downloaded page')
rows = table.find_all("tr")

postal = []
bor = []
nei = []

# The first row is always skipped: it is treated as the table header.
for tr in rows[1:]:
    cells = tr.find_all("td")

    # A row without all three data cells cannot be unpacked; skip it rather
    # than crash with an IndexError.
    if len(cells) < 3:
        continue

    # strip() removes both the trailing newline and any leading whitespace.
    code, borough, neighborhood = (cell.get_text().strip() for cell in cells[:3])

    # Postal codes without a borough are dropped entirely.
    if borough.lower() == "not assigned":
        continue

    # A missing neighbourhood falls back to the borough name.
    if neighborhood.lower() == "not assigned":
        neighborhood = borough

    postal.append(code)
    bor.append(borough)
    # Neighbourhood lists separated by " / " are normalised to ", ".
    nei.append(neighborhood.replace(" / ", ", "))

In [4]:
# Assemble the three parallel lists into a single frame, one row per postal code.
df = pd.DataFrame(
    {
        "PostalCode": postal,
        "Borough": bor,
        "Neighborhood": nei,
    }
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [8]:
# Load the CSV that pairs each postal code with a latitude and a longitude.
geo_url = "http://cocl.us/Geospatial_data"
gf = pd.read_csv(geo_url)
gf.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# df2 is a copy of df with two extra float columns, Latitude and Longitude,
# both initialised to 0.0 as placeholders.
names = [*df.columns, "Latitude", "Longitude"]
df2 = df.assign(Latitude=0.0, Longitude=0.0)
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,0.0,0.0
1,M4A,North York,Victoria Village,0.0,0.0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",0.0,0.0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",0.0,0.0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",0.0,0.0


In [9]:
# Fill in each row's coordinates by looking its postal code up in gf.
# One indexed lookup table replaces a full scan of gf per row; keeping only the
# first occurrence of a postal code means a repeated code still resolves to the
# first matching row.
coords = gf.drop_duplicates(subset="Postal Code").set_index("Postal Code")

# Fail loudly, naming the offending codes, when a postal code has no coordinates.
missing = df2.loc[~df2["PostalCode"].isin(coords.index), "PostalCode"].unique().tolist()
if missing:
    raise KeyError(f"postal codes not found in gf: {missing}")

# Series.map matches on values, so this does not depend on df2's index labels.
df2["Latitude"] = df2["PostalCode"].map(coords["Latitude"])
df2["Longitude"] = df2["PostalCode"].map(coords["Longitude"])

df2.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [10]:
# Show the smallest latitude and the largest longitude in the data, one per line.
lat_min = df2["Latitude"].min()
long_max = df2["Longitude"].max()
print(lat_min, long_max, sep="\n")

43.60241370000001
-79.16049709999999


In [11]:
def _dms_to_decimal(degrees, minutes, seconds):
    """Convert a degrees/minutes/seconds triple to decimal degrees."""
    return degrees + minutes / 60 + seconds / 3600


# Reference coordinates, written as degrees/minutes/seconds and converted to
# decimal degrees; the longitude is negated.
lat_t = _dms_to_decimal(43, 44, 30)
long_t = -_dms_to_decimal(79, 22, 24)
print(lat_t, long_t)

43.74166666666667 -79.37333333333332


In [14]:
# Coordinate-only frame: just the Latitude and Longitude columns of df2.
dfll = df2[["Latitude", "Longitude"]].copy()
dfll.head()

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.65426,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494


In [15]:
# Number of k-means clusters to partition the coordinates into.
kclusters = 5

# n_init is pinned explicitly: scikit-learn changed its default between
# releases, so relying on it makes the labels depend on the installed version
# (and triggers a FutureWarning on some of them). random_state fixes the seed.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0)

# fit_predict fits the model and returns the label of every input row, which
# avoids a separate predict() pass over the same data.
clusters = kmeans.fit_predict(dfll)
cltCenters = kmeans.cluster_centers_

In [16]:
# df3 is df2 plus a "cluster" column holding each row's k-means label.
df3 = df2.assign(cluster=clusters)
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,cluster
0,M3A,North York,Parkwoods,43.753259,-79.329656,4
1,M4A,North York,Victoria Village,43.725882,-79.315572,4
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,3
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2
