## Segmenting and Clustering Neighborhoods in Toronto  - Part 1

In [23]:
## Import the required libraries

import pandas as pd
import numpy as np

In [5]:
# Source page for the Toronto postal-code table. The `oldid` query parameter
# requests one specific historical revision of the Wikipedia article rather
# than the live page -- presumably so the table layout stays stable; confirm
# before bumping the revision id.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641'


In [6]:
## pd.read_html parses every <table> on the page into a DataFrame, so
## Beautiful Soup is not needed just to pull out one table.
dfs = pd.read_html(url)

## The postal-code table is the first table on the page.
inputdf = dfs[0]

## Use short, space-free column names.
inputdf.columns = ['PostalCode', 'Borough', 'Neighborhood']

## Keep only the rows whose Borough has actually been assigned.
has_borough = inputdf['Borough'].ne('Not assigned')
inputdf = inputdf.loc[has_borough].reset_index(drop=True)

## Where the Neighborhood is 'Not assigned', fall back to the Borough name.
inputdf['Neighborhood'] = inputdf['Neighborhood'].mask(
    inputdf['Neighborhood'].eq('Not assigned'),
    inputdf['Borough'],
)

In [7]:
## Collapse the table to one row per (PostalCode, Borough) pair by joining
## all of that pair's neighborhoods into a single comma-separated string.
inputdf = (
    inputdf
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
inputdf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:

# Sanity check: (rows, columns) of the cleaned, one-row-per-postal-code table.
inputdf.shape

(103, 3)

## Segmenting and Clustering Neighborhoods in Toronto  - Part 2

In [9]:
## Load the latitude/longitude lookup for each postal code from the CSV file
## that sits next to this notebook.
geocsv = pd.read_csv('Geospatial_Coordinates.csv')
geocsv.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
## Attach coordinates to every postal code.
## The CSV's key column is renamed first so both frames share the
## 'PostalCode' join key.
geocsv.columns = ['PostalCode', 'Latitude', 'Longitude']

## Inner join: only postal codes present in both frames are kept.
df_merge = inputdf.merge(geocsv, how='inner', on='PostalCode')
df_merge.shape

(103, 5)

## Segmenting and Clustering Neighborhoods in Toronto  - Part 3

In [11]:
## Keep only the boroughs whose name contains 'Toronto'
## and renumber the rows from zero.
is_toronto = df_merge['Borough'].str.contains('Toronto')
toronto_df = df_merge.loc[is_toronto].reset_index(drop=True)


In [12]:
# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # Module to convert a location to Latitude/longitude
# !conda install -c conda-forge folium=0.5.0
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Installed')

Installed


### Get latitude and longitude for Toronto

In [13]:
## Look up the coordinates of the city centre; they are used to centre the maps.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

## geocode() returns None when the service finds no match; fail with a clear
## message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

Toronto_latitude = location.latitude
Toronto_longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(Toronto_latitude, Toronto_longitude))

The geograpical coordinate of Manhattan are 43.6534817, -79.3839347.


In [14]:
## Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(
    location=[Toronto_latitude, Toronto_longitude],
    zoom_start=10,
)


### Visualizing the Toronto dataframe using folium

In [15]:
## Draw one blue circle per postal code; the popup shows
## "<neighborhoods>,<borough>".
## Note: markers are added to the existing map_toronto object, so running
## this cell again stacks a second set of markers on top of the first.
marker_rows = zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df['Borough'],
    toronto_df['Neighborhood'],
)
for lat, lon, borough, neigh in marker_rows:
    popup = folium.Popup(f'{neigh},{borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option; confirm that
        # CircleMarker does anything with it before removing.
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### KMeans Clustering

In [17]:
# Peek at one row to see which columns are available before clustering.
toronto_df.head(1)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [20]:
## Cluster the Toronto postal codes on their coordinates.
k = 5

## Select the features by name instead of dropping the other columns:
##  * drop([...], 1) passes `axis` positionally, which newer pandas rejects;
##  * on a re-run a drop would leave the old 'Cluster Labels' column in the
##    feature matrix and cluster on it.
toronto_cluster = toronto_df[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=k, random_state=10).fit(toronto_cluster)

## DataFrame.insert raises ValueError if the column already exists, so remove
## labels left over from a previous run to keep this cell re-runnable.
if 'Cluster Labels' in toronto_df.columns:
    toronto_df = toronto_df.drop(columns='Cluster Labels')
toronto_df.insert(0, 'Cluster Labels', kmeans.labels_)


array([0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       3, 3, 4, 4, 4, 2, 2, 2, 4, 1, 4, 4, 1, 1, 1, 0], dtype=int32)

In [21]:
# Display the frame with the 'Cluster Labels' column now in first position.
toronto_df

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,0,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,0,M4M,East Toronto,Studio District,43.659526,-79.340923
4,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,2,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,2,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


### Create Map

In [24]:

## Create map
map_clusters = folium.Map(location=[Toronto_latitude, Toronto_longitude], zoom_start=10)

# set color scheme for the clusters: k evenly spaced colors from the rainbow
# colormap, converted to hex strings so folium can use them
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighborhood'], toronto_df['Cluster Labels']):
    label = folium.Popup('{} Cluster {}'.format(poi, cluster), parse_html=True)
    # KMeans labels are 0-based (0 .. k-1), so the label indexes the palette
    # directly; `cluster - 1` would send label 0 to rainbow[-1] via negative
    # index wraparound.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters
