# Exploration and clustering of Toronto neighbourhoods  

<h1>Table of contents</h1>

<div class="alert alert-block alert-info" style="margin-top: 20px">
    <ol>
        <li><a href="#ref1">Data Collection</a></li>
        <li><a href="#ref2">Data Exploration</a></li>
        <li><a href="#ref3">Clustering</a></li>
    </ol>
</div>
<br>
<hr>  

<h2>
<div class="alert alert-danger" style="margin-top: 20px">
          If the map can NOT be displayed, please view the notebook via Jupyter nbviewer by <br> <a href="https://nbviewer.jupyter.org/github/Abdurahman-Amat/Coursera_Capstone/blob/master/Explore_Cluster_Toronto_Neighbourhood.ipynb" target="_blank">clicking here</a>. Thank you!
</div>
</h2>


<hr>

<a id="ref1"></a>
# Data Collection  

### Import the libraries needed to collect the dataset by web scraping and to prepare the expected DataFrame

In [39]:
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
#!pip install pgeocode
import pgeocode
from sty import fg, bg, ef, rs
import folium # map rendering library
# import k-means from clustering stage
from sklearn.cluster import KMeans

print('\nLibraries are imported.')


Libraries are imported.


### Perform a little web scraping to get the data  
I will use BeautifulSoup to scrape the given website
and save the scraped raw data into a CSV file.

In [40]:
# Download the Wikipedia page listing the Toronto ("M") postal codes
main_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(main_url)
# Stop on an HTTP error instead of parsing an error page
r.raise_for_status()
r.encoding = 'utf-8'
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed
page_soup = BeautifulSoup(r.text, 'html.parser')
# print(page_soup.prettify())
# The expected information is located within the first 'table' tag,
# so keep only that table
soup = page_soup.find('table')
if soup is None:
    raise RuntimeError(f'No <table> element found at {main_url}')

# newline='' is what the csv module expects (avoids blank lines on Windows);
# an explicit encoding keeps non-ASCII names readable by pandas later on.
# The 'with' block closes the file even if a row fails to parse.
with open('Toronto_Postcode_Borough_Neighbourhood.csv', 'w',
          newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    for row in soup.find_all('tr'):
        # Text fragments of the row: [postcode, borough, neighbourhood, ...]
        soup_splitted = list(row.stripped_strings)
        # A row with fewer than three text cells cannot be a postcode record
        if len(soup_splitted) < 3:
            continue
        postcode, borough, neighbourhood = soup_splitted[:3]
        # Ignore cells with 'Not assigned' entries for Borough, as required
        # in the assignment
        if borough == 'Not assigned':
            continue
        # If 'Neighbourhood' is 'Not assigned', use the 'Borough' value instead
        if neighbourhood == 'Not assigned':
            neighbourhood = borough
        csv_writer.writerow([postcode, borough, neighbourhood])

### Create the Pandas dataframe from the raw data scraped above

In [41]:
# Define the dataframe from the above collected 'csv' file
df = pd.read_csv('Toronto_Postcode_Borough_Neighbourhood.csv')
# Look at first few entries
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


Check how many entries we have

In [42]:
df.shape

(210, 3)

In [43]:
df[df['Postcode']=='M5A']

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M5A,Downtown Toronto,Harbourfront


In [44]:
def merge_Neighbourhoods(df):
    """Merge all neighbourhoods that share a postcode into one entry.

    Columns are read by position: the first column is the postcode, the
    second the borough and the third the neighbourhood.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (postcode, borough, neighbourhood) record.

    Returns
    -------
    list of list
        One ``[postcode, borough, neighbourhood]`` entry per unique postcode,
        ordered by postcode. When a postcode occurs on several rows, the
        neighbourhood names are joined with ``', '`` in row order and the
        borough of the first row is used. A postcode that occurs once is
        returned unchanged.
    """
    # Group once instead of re-filtering the whole frame for every postcode
    grouped = df.groupby(df.columns[0], sort=True)
    print(f'There are {grouped.ngroups} of unique Postcodes while {df.shape[0]} of Neighbourhoods.\n So there are many Neighbourhoods are having the same Postcodes.')
    Ent_list = []
    for _, df_pc in grouped:
        neighbourhoods = df_pc.iloc[:, 2]
        if len(df_pc) > 1:
            # More than one neighbourhood for this postcode: join the names
            neighbourhood = ', '.join(neighbourhoods.to_list())
        else:
            # Single neighbourhood: keep the value untouched
            neighbourhood = neighbourhoods.iloc[0]
        # .iat is purely positional, so it does not depend on column labels
        Ent_list.append([df_pc.iat[0, 0], df_pc.iat[0, 1], neighbourhood])
    return Ent_list

Apply above prepared function

In [45]:
new_data_list = merge_Neighbourhoods(df)

There are 103 of unique Postcodes while 210 of Neighbourhoods.
 So there are many Neighbourhoods are having the same Postcodes.


Check if the function worked as expected

In [46]:
np.shape(new_data_list)

(103, 3)

So, it seems the function has done its job.  

###  Now prepare wanted DataFrame

In [47]:
new_df = pd.DataFrame(new_data_list, columns =['Postcode', 'Borough', 'Neighbourhood']) 

Check the data frame

In [48]:
new_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [49]:
new_df[new_df['Postcode']=='M5A']

Unnamed: 0,Postcode,Borough,Neighbourhood
53,M5A,Downtown Toronto,Harbourfront


### **Noticed** *that there is only a single Neighbourhood for the Postcode 'M5A', while the assignment instructions mention two Neighbourhoods for 'M5A'. It is possible that the postal codes have been updated since the course material was prepared. Or ...?*

### Shape of the dataset

In [50]:
new_df.shape

(103, 3)

<hr>

### Prepare the location dataset 

In [51]:
# #!pip install geocoder
# import geocoder # import geocoder

# # initialize your variable to None
# lat_lng_coords = None

# # loop until you get the coordinates
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format('M1E'))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]
# print(latitude, longitude)

The above method of collecting location data for the Toronto neighbourhoods didn't work, even after waiting a long time (a few minutes).  

### Then tried another method via the library **pgeocode**

In [52]:
nomi = pgeocode.Nominatim('ca')  # 'ca' for Canada
# Query and print the coordinates for every postcode in new_df
for p_code in new_df['Postcode']:
    locinfo = nomi.query_postal_code(p_code)
    # Read the coordinates by field name: integer access such as locinfo[9]
    # on a label-indexed Series depends on the column order and is deprecated
    # in recent pandas versions
    latitude = locinfo['latitude']
    longitude = locinfo['longitude']
    # fg.red / fg.rs come from 'sty' (install with: pip install sty) and
    # switch the red foreground colour on and off around each value
    print(f'Postcode is {fg.red}{p_code}{fg.rs}, latitude is {fg.red}{latitude:.5f}{fg.rs}, longitude is {fg.red}{longitude:.5f}{fg.rs}')

Postcode is [31mM1B[39m, latitude is [31m43.81130[39m, longitude is [31m-79.19300[39m
Postcode is [31mM1C[39m, latitude is [31m43.78780[39m, longitude is [31m-79.15640[39m
Postcode is [31mM1E[39m, latitude is [31m43.76780[39m, longitude is [31m-79.18660[39m
Postcode is [31mM1G[39m, latitude is [31m43.77120[39m, longitude is [31m-79.21440[39m
Postcode is [31mM1H[39m, latitude is [31m43.76860[39m, longitude is [31m-79.23890[39m
Postcode is [31mM1J[39m, latitude is [31m43.74640[39m, longitude is [31m-79.23230[39m
Postcode is [31mM1K[39m, latitude is [31m43.72980[39m, longitude is [31m-79.26390[39m
Postcode is [31mM1L[39m, latitude is [31m43.71220[39m, longitude is [31m-79.28430[39m
Postcode is [31mM1M[39m, latitude is [31m43.72470[39m, longitude is [31m-79.23120[39m
Postcode is [31mM1N[39m, latitude is [31m43.69520[39m, longitude is [31m-79.26460[39m
Postcode is [31mM1P[39m, latitude is [31m43.76120[39m, longitude is [31m-79

It is very fast and broadly matches the additional location data given in 'Geospatial_Coordinates.csv'.  
However, it could not find the latitude and longitude for the neighbourhood with postcode 'M7R'. In addition,  
the two sources differ in precision: both latitude and longitude in the provided location table have up to six decimal digits, while the data queried via **pgeocode** have only up to four, and most of the values differ after the first or second decimal digit.  *Therefore I will use the provided location table.*

### Load the provided location data and read it into Pandas DataFrame

In [53]:
# Read the provided coordinates table into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path — the
# notebook will not run elsewhere unless the path is adjusted.
loc_df = pd.read_csv(r'C:\pythonwork\data\Geospatial_Coordinates.csv')
# Preview the first rows
loc_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [54]:
# check if we have the full location info
loc_df.shape 

(103, 3)

In [55]:
# quick check to make sure there are no missing values which may cause errors
# in the upcoming steps of this project
loc_df.isna().sum()

Postal Code    0
Latitude       0
Longitude      0
dtype: int64

There are no missing values, which is good.

In [56]:
# Rename the column to make it easier to merge this location data with above neighbourhood data
loc_df.rename(columns={"Postal Code":"Postcode"}, inplace=True)

In [57]:
loc_df.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the above two datasets to get the expected dataset

In [58]:
neigh_loc_df = pd.merge(new_df, loc_df, on='Postcode')
neigh_loc_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


Check the dataset again to make sure the merging is done correctly as expected.

In [59]:
neigh_loc_df.shape

(103, 5)

### So, it seems everything went well.

<hr>  

<a id="ref2"></a>
# Data Exploration  

Look at how many Boroughs are in the dataset

In [60]:
# Number of distinct boroughs and number of rows in the merged frame
borough_count = neigh_loc_df['Borough'].nunique(dropna=False)
row_count = len(neigh_loc_df)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 neighborhoods.


Look at the Borough column and keep those Boroughs which have the word Toronto in their names. 

In [61]:
unique_boroughs = neigh_loc_df['Borough'].unique()
# unique Boroughs are
print(unique_boroughs)

['Scarborough' 'North York' 'East York' 'East Toronto' 'Central Toronto'
 'Downtown Toronto' 'York' 'West Toronto' 'Mississauga' 'Etobicoke'
 "Queen's Park"]


In [62]:
# Pair every borough name with its space-separated words, then keep the
# multi-word names in which 'Toronto' appears as a whole word
split_names = ((name, name.split(' ')) for name in unique_boroughs)
b_Toronto = [
    name
    for name, words in split_names
    if len(words) > 1 and 'Toronto' in words
]
print(f'Boroughs with the word Toronto are : {b_Toronto}')

Boroughs with the word Toronto are : ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']


In [63]:
filter_Toronto = neigh_loc_df['Borough'].isin(b_Toronto)

In [64]:
Toronto_df = neigh_loc_df[filter_Toronto].reset_index(drop=True)
Toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [65]:
Toronto_df.shape

(39, 5)

In [66]:
# Geolocation of Toronto
Toronto_lat, Toronto_long = 43.651070, -79.347015

Plot the map of Toronto, marking the neighbourhoods of those Boroughs whose names contain the word Toronto

In [67]:
# Create a map of Toronto centred on the city's latitude and longitude
map_Toronto = folium.Map(location=[Toronto_lat, Toronto_long], zoom_start=12)

# Add one circle marker per row of Toronto_df
marker_rows = zip(
    Toronto_df['Latitude'],
    Toronto_df['Longitude'],
    Toronto_df['Borough'],
    Toronto_df['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    # Popup text shown on click: "<neighbourhood>, <borough>"
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    # parse_html is a Popup option, not a marker/path option, so it is no
    # longer passed to CircleMarker
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_Toronto)

# Last expression of the cell: display the map
map_Toronto

<hr>

<a id="ref3"></a>
# Clustering  

In [68]:
# Set the number of clusters: there are four Boroughs whose names include
# the word Toronto
kclusters = 4
# K-means clustering does not work well with categorical features, so drop
# them and cluster on Latitude/Longitude only
Toronto_clustering = Toronto_df.drop(['Postcode', 'Borough', 'Neighbourhood'], axis=1)
# Run k-means clustering. n_init is pinned because the library default
# changed between scikit-learn releases (10 -> 'auto'); together with
# random_state this keeps the labels reproducible across versions.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(Toronto_clustering)

# Check that one cluster label was generated for each row in the dataframe
len(kmeans.labels_)

39

Add cluster labels

In [69]:
Toronto_clustering.insert(0, 'Cluster Labels', kmeans.labels_)
Toronto_clustering.head()

Unnamed: 0,Cluster Labels,Latitude,Longitude
0,1,43.676357,-79.293031
1,1,43.679557,-79.352188
2,1,43.668999,-79.315572
3,1,43.659526,-79.340923
4,2,43.72802,-79.38879


In [70]:
# Attach the cluster labels to Toronto_df.
# Toronto_clustering was built from Toronto_df by dropping columns, so both
# frames share the same row index. Joining on that index avoids merging on
# float Latitude/Longitude values, which would duplicate rows whenever two
# postcodes share the same coordinates.
Toronto_clustered = Toronto_df.join(Toronto_clustering[['Cluster Labels']])
Toronto_clustered.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,1
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197,2
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,2
7,M4S,Central Toronto,Davisville,43.704324,-79.38879,2
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,2


In [71]:
Toronto_clustered.shape

(39, 6)

### Look at each cluster and the Boroughs assigned to it by the model

In [72]:
# cluster label: 0
Toronto_clustered[Toronto_clustered['Cluster Labels']==0]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
30,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0
31,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259,0
32,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975,0
33,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,0
34,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763,0
35,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325,0
36,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445,0


In [73]:
# cluster label: 1
Toronto_clustered[Toronto_clustered['Cluster Labels']==1]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,1
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1
38,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,1


In [74]:
# cluster label: 2
Toronto_clustered[Toronto_clustered['Cluster Labels']==2]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197,2
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,2
7,M4S,Central Toronto,Davisville,43.704324,-79.38879,2
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,2
22,M5N,Central Toronto,Roselawn,43.711695,-79.416936,2
23,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307,2


In [75]:
# cluster label: 3
Toronto_clustered[Toronto_clustered['Cluster Labels']==3]

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
10,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,3
11,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,3
12,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,3
13,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,3
14,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,3
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,3
16,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,3
17,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,3
18,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,3
19,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,3


### Plot the clustering result on the map of Toronto

In [76]:
# Create a map of Toronto centred on the city's latitude and longitude
map_Toronto_cluster = folium.Map(location=[Toronto_lat, Toronto_long], zoom_start=12)

# One colour per cluster, evenly spaced along the gist_rainbow colormap
colors_array = cm.gist_rainbow(np.linspace(0, 1, kclusters))
gist_rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]
# Cluster label -> colour. The "- 1" offset (label 0 wraps round to the last
# colour) reproduces the original lookup, so the rendered colours are
# unchanged; the wrap is now explicit instead of relying on a negative index.
cluster_colours = {
    label: gist_rainbow[(label - 1) % kclusters] for label in range(kclusters)
}

# Add one marker per row, coloured by its cluster label
marker_rows = zip(
    Toronto_clustered['Latitude'],
    Toronto_clustered['Longitude'],
    Toronto_clustered['Neighbourhood'],
    Toronto_clustered['Cluster Labels'],
)
for lat, lon, poi, cluster in marker_rows:
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    colour = cluster_colours[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=colour,
        fill=True,
        fill_color=colour,
        fill_opacity=1.0).add_to(map_Toronto_cluster)

# Legend: the dominant borough of each cluster (text taken from the cluster
# tables in this notebook). The swatch colours come from the same mapping as
# the markers, so legend and map cannot drift apart.
legend_boroughs = [
    'West Toronto (6/7)',
    'East Toronto (5/5)',
    'Central Toronto (8/8)',
    'Downtown Toronto (18/19)',
]
legend_rows = '<br>\n'.join(
    f'&nbsp; Cluster {label} : {text} &nbsp; '
    f'<i class="fa fa-circle fa-1x" style="color:{cluster_colours[label]}"></i>'
    for label, text in zip(range(kclusters), legend_boroughs)
)
legend_html = f'''
                <div style="position: fixed; 
                            bottom: 100px; right: 50px; width: 300px; height: 150px; 
                            border:2px solid grey; z-index:9999; font-size:14px;
                            ">&nbsp; Boroughs <br>
                              <br>
{legend_rows}
                </div>
                '''
map_Toronto_cluster.get_root().html.add_child(folium.Element(legend_html))

# Last expression of the cell: display the map
map_Toronto_cluster


<hr>

The model gave an almost perfect result. 

<hr>