In [None]:
!conda install -c anaconda beautifulsoup4 
!conda install -c anaconda lxml
!conda install -c anaconda html5lib
!conda install -c anaconda requests

### Import required libraries

In [40]:
import requests
import pandas as pd
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
from bs4 import BeautifulSoup
soup = BeautifulSoup(website_url,'lxml')

### Insert Wikipedia table into variable

In [41]:
My_table = soup.find('table',{'class':'wikitable sortable'}) 

### Parse table and create dataframe according to the given conditions

In [43]:
table_rows = My_table.find_all('tr')

PostCode =[]
Borough = []
Neighbourhood = []

for tr in table_rows:
    td = tr.find_all('td')
    if (len(td) > 0 and td[1].text != 'Not assigned'): #check if it valid row and if borough!="Not Assigned"
        if td[0].text not in PostCode: #if postcode not present before then create new
            PostCode.append((td[0].text)) #append postcode in array
            a = td[1].find_all('a')
            if(len(a) > 0):
                Borough.append((((td[1].find('a')).get('title')).split(','))[0]) #append borough in array for hyperlink
            else:
                Borough.append((((td[1].text)).split(','))[0]) #append borough in array for non-hyperlink
            a = td[2].find_all('a')
            if(len(a) > 0):
                Neighbourhood.append((((td[2].find('a')).get('title')).split(','))[0]) #append neighbourhood in array for hyperlink
            else:   #append neighbourhood in array for non-hyperlink
                if td[2].text == 'Not assigned\n': 
                    Neighbourhood.append(((td[1].text).split(','))[0])#append borough if non assigned
                else:
                    Neighbourhood.append((td[2].text).replace('\n', '')) # append neighbourhood if non-hyperlink
        else: #if postcode is present in the array
            ind = PostCode.index(td[0].text)
            a = td[2].find_all('a')
            if(len(a) > 0):
                testneighbourhood = (((td[2].find('a')).get('title')).split(','))[0] #append neighbourhood in array for hyperlink
            else:   #append neighbourhood in array for non-hyperlink
                if td[2].text != 'Not assigned\n':
                    testneighbourhood = (td[2].text).replace('\n', '') # append neighbourhood if non-hyperlink
            if Borough[ind] == Neighbourhood[ind]:
                Neighbourhood[ind] = testneighbourhood
            else:
                Neighbourhood[ind] = Neighbourhood[ind] + ', ' + testneighbourhood           
        
df=pd.DataFrame()
df['PostCode'] = PostCode
df['Borough'] = Borough
df['Neighbourhood'] = Neighbourhood

## DataFrame Shape

In [44]:
df.shape

(103, 3)

### Using geocoder to import latitude and longitude

In [None]:
!conda install -c conda-forge geocoder 

### Populate latitude, longitude using geocoder

In [None]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None
latitude = []
longitude = []

for index, row in df.iterrows():
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(row['PostCode']))
        lat_lng_coords = g.latlng
    latitude.append(g.latlng[0])
    longitude.append(g.latlng[1])
df['Latitude'] = latitude
df['Latitude'] = longitude

## Populate latitide, longitude using excel sheet

In [46]:
latitude = []
longitude = []
latlong = pd.read_csv('http://cocl.us/Geospatial_data')
#print((latlong.loc[latlong['Postal Code'] == 'M3A'])['Postal Code'])
for index, row in df.iterrows():
    templatlong = latlong[latlong['Postal Code'].str.match(row['PostCode'])]
    latitude.append((templatlong['Latitude'].values)[0])
    longitude.append((templatlong['Longitude'].values)[0])
df['Latitude'] = latitude
df['Longitude'] = longitude

# Clustering Toronto Data

### Creating a dataframe with borough's in Toronto

In [49]:
toronto_data = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park",43.65426,-79.360636
1,M7A,Queen's Park (Toronto),Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [50]:
toronto_data_grouped = toronto_data.groupby('Neighbourhood').mean().reset_index()
toronto_data_grouped.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,"Adelaide, King, Richmond",43.650571,-79.384568
1,Berczy Park,43.644771,-79.373306
2,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
3,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442


### K Means Clustering

In [51]:
# set number of clusters
from sklearn.cluster import KMeans
kclusters = 5

toronto_grouped_clustering = toronto_data_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 4, 1, 2, 2, 2, 4, 4, 2], dtype=int32)

## Insert cluster information to original dataframe

In [52]:
toronto_data['Cluster Labels']=kmeans.labels_
toronto_data.head()

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M5A,Downtown Toronto,"Harbourfront (Toronto), Regent Park",43.65426,-79.360636,2
1,M7A,Queen's Park (Toronto),Queen's Park,43.662301,-79.389494,2
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,4
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,2


### Importing libraries for maps

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes 
import folium 
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

In [55]:
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Plotting clusters on the map

In [57]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood'], toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters