In [146]:
#!pip install geocoder
#!conda install -c conda-forge folium=0.5.0 --yes
#!conda install -c conda-forge basemap
import pandas as pd
import numpy as np
import requests
import lxml.html as lh
import urllib.request
import time
import itertools
import geocoder
import matplotlib.pyplot as plt
import folium
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from pylab import rcParams
from sklearn.cluster import KMeans
%matplotlib inline



In [83]:
# Scrape the Toronto postal-code table from Wikipedia into the DataFrame `df`.

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Use a timeout and an explicit status check so a hung connection or an HTTP
# error page stops the cell instead of being parsed as if it were the table.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Store the parsed contents of the website under 'content'
content = lh.fromstring(response.content)
# '//tr' selects the <tr> rows of every table in the document
tr_elements = content.xpath('//tr')

# The first <tr> is treated as the header: build one (title, values) pair per header cell
col = [(header_cell.text_content().rstrip(), []) for header_cell in tr_elements[0]]

# Since our first row is the header, data is stored on the second row onwards
for row in tr_elements[1:]:
    cells = [cell.text_content() for cell in row.iterchildren()]
    # Keep a row only when its first cell is exactly 4 raw characters long
    # (presumably a 3-character postal code plus a trailing newline -- TODO confirm
    # against the live page).
    if not cells or len(cells[0]) != 4:
        continue
    # Rows from other tables can have a different number of cells than the header;
    # appending them would either raise IndexError (too many cells) or leave the
    # columns with unequal lengths (too few), so skip them.
    if len(cells) != len(col):
        continue
    for (_, values), text in zip(col, cells):
        values.append(text.rstrip())

# Convert to a pandas DataFrame: one column per header title
df = pd.DataFrame(dict(col))

# Drop postal codes whose borough is "Not assigned" and renumber the rows from 0
df = df[~df['Borough'].isin(['Not assigned'])].reset_index(drop=True)

# check shape
df.shape

(103, 3)

In [128]:
# adding coordinates from CSV file (going the safe way considering the I get None from geolocation most of the time)
!wget -q -O geo_data 'http://cocl.us/Geospatial_data'
file = pd.read_csv('geo_data')
result = pd.merge(df, file, on='Postal Code')
df=result
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [130]:
# Check the shape again after the merge: the row count should match the frame
# before the merge, with the coordinate columns added.
df.shape

(103, 5)

In [139]:
# Draw a map with all neighborhoods in Toronto.
map_toronto = folium.Map(location=[43.65, -79.38], zoom_start=12)

# Add one circle marker per row of df, labelled with the neighborhood name
for lat, lng, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html belongs to Popup, not to CircleMarker, so it is passed only here.
        popup=folium.Popup(neighborhood, parse_html=True),
        color='black',
        fill=True,
        fill_color='red',
        fill_opacity=0.5,
    ).add_to(map_toronto)

map_toronto

In [156]:
# Run k-means clustering with 4 clusters on latitude and longitude
df_latlon = df[['Latitude', 'Longitude']]

# random_state fixes the centroid seeding. n_init is pinned to 10 (the long-standing
# default) so results do not change, and no FutureWarning is raised, on scikit-learn
# versions where the default differs.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10).fit(df_latlon)

# Insert the cluster labels as the first column of df.
# DataFrame.insert raises ValueError if the column already exists, so remove a
# copy left by an earlier run of this cell first (in place, same df object).
if 'Cluster Labels' in df.columns:
    del df['Cluster Labels']
df.insert(0, 'Cluster Labels', kmeans.labels_)

# Display the label assigned to each row
kmeans.labels_

array([3, 3, 0, 2, 0, 1, 3, 2, 0, 0, 2, 1, 3, 0, 0, 0, 2, 1, 3, 0, 0, 2,
       3, 0, 0, 0, 3, 2, 2, 0, 0, 0, 3, 2, 2, 0, 0, 0, 3, 2, 2, 0, 0, 0,
       3, 2, 1, 0, 0, 1, 1, 3, 2, 1, 0, 2, 1, 1, 3, 2, 1, 2, 2, 1, 1, 3,
       2, 2, 2, 1, 1, 3, 2, 2, 0, 1, 1, 1, 3, 0, 0, 1, 3, 0, 0, 3, 0, 0,
       1, 1, 3, 0, 0, 1, 1, 3, 0, 0, 1, 0, 0, 1, 1], dtype=int32)

In [165]:
# Draw a map showing the k-means clusters in Toronto.
map_toronto_cluster = folium.Map(location=[43.65, -79.38], zoom_start=12)

# Marker colour for each cluster label. A label missing from this table gets the
# fallback colour instead of silently reusing the previous marker's colour
# (or raising NameError on the first row).
cluster_colors = {0: 'Red', 1: 'Blue', 2: 'Green', 3: 'Black'}
fallback_color = 'Gray'

# Add one circle marker per row of df, coloured by its cluster label
for lat, lng, neighborhood, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['Cluster Labels']):
    color = cluster_colors.get(cluster, fallback_color)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html belongs to Popup, not to CircleMarker, so it is passed only here.
        popup=folium.Popup(neighborhood, parse_html=True),
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.9,
    ).add_to(map_toronto_cluster)

map_toronto_cluster