## Segmentation and clustering neighborhoods


#### Importing the packages

In [1]:
#import libraries
import io
import os

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.html import read_html

# Wikipedia page that lists the postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list holding one DataFrame per <table class="wikitable">
toronto_table = read_html(url, attrs={'class': 'wikitable'})

# work with the first matching table
toronto_tables = pd.DataFrame(toronto_table[0])

# remove, in place, every row whose borough is 'Not assigned'
index1 = toronto_tables.index[toronto_tables['Borough'] == 'Not assigned']
toronto_tables.drop(index=index1, inplace=True)

# collapse to one row per (Postcode, Borough) pair, joining the neighbourhood
# names of that pair into a single comma-separated string
df1 = (
    toronto_tables
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)

df1

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [2]:


# show the (rows, columns) dimensions of df1, the grouped postal-code table
df1.shape



(103, 3)

In [3]:
#establish foursquare credentials
# The id and secret are read from environment variables so they are never stored
# in the notebook source or echoed into its saved output. Both fall back to an
# empty string so the names always exist as str.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30

# Report only whether the credentials are available, never their values.
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('Foursquare credentials not set: define FOURSQUARE_CLIENT_ID and '
          'FOURSQUARE_CLIENT_SECRET before starting the kernel.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [4]:
#use pandas to read in csv with lat and long
url="https://cocl.us/Geospatial_data"

# Fail loudly on a network problem or an HTTP error status: without the check an
# error page would be handed to read_csv and parsed as if it were the data.
response = requests.get(url, timeout=30)
response.raise_for_status()

s=response.content
c=pd.read_csv(io.StringIO(s.decode('utf-8')))
c.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [5]:


#combine tables df1 and c into result table
# Rows are matched on the postal code itself rather than on row position, so the
# coordinates stay attached to the right postal code even if the two tables are
# ordered differently or one of them has extra rows. Both key columns
# ('Postcode' and 'Postal Code') are kept in the result. validate= makes pandas
# raise if either side contains a duplicated postal code.
result = df1.merge(
    c,
    how='inner',
    left_on='Postcode',
    right_on='Postal Code',
    validate='one_to_one',
)
result.head()



Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [6]:


# result carries the postal code twice: 'Postcode' and 'Postal Code'. Keep one
# copy, named 'Postal Code'.
# The guard makes the cell safe to re-run: once 'Postcode' has been renamed, an
# unguarded second run would drop the only remaining postal-code column.
if 'Postcode' in result.columns:
    result = (
        result
        .drop(columns='Postal Code')
        .rename(columns={'Postcode': 'Postal Code'})
    )
result

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [7]:
!pip install folium
import folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 7.0MB/s eta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [8]:
# one-hot encode the borough: one indicator column per borough, named after the
# borough itself (no prefix)
toronto_onehot = pd.get_dummies(result[['Borough']], prefix="", prefix_sep="")

# bring the neighbourhood names along; the new column is appended last
toronto_onehot['Neighbourhood'] = result['Neighbourhood']

# rotate the column order so that the last column (Neighbourhood) comes first
column_names = toronto_onehot.columns.tolist()
fixed_columns = column_names[-1:] + column_names[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,"Rouge, Malvern",0,0,0,0,0,0,0,1,0,0
1,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,1,0,0
2,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,1,0,0
3,Woburn,0,0,0,0,0,0,0,1,0,0
4,Cedarbrae,0,0,0,0,0,0,0,1,0,0


In [9]:
# one row per neighbourhood: average every borough indicator within the group,
# then turn the group key back into an ordinary column
toronto_grouped = (
    toronto_onehot
    .groupby('Neighbourhood')
    .mean()
    .reset_index()
)
toronto_grouped

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,"Adelaide, King, Richmond",0,1,0,0,0,0,0,0,0,0
1,Agincourt,0,0,0,0,0,0,0,1,0,0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0,0,0,0,0,0,0,1,0,0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0,0,0,0,1,0,0,0,0,0
4,"Alderwood, Long Branch",0,0,0,0,1,0,0,0,0,0
5,"Bathurst Manor, Downsview North, Wilson Heights",0,0,0,0,0,0,1,0,0,0
6,Bayview Village,0,0,0,0,0,0,1,0,0,0
7,"Bedford Park, Lawrence Manor East",0,0,0,0,0,0,1,0,0,0
8,Berczy Park,0,1,0,0,0,0,0,0,0,0
9,"Birch Cliff, Cliffside West",0,0,0,0,0,0,0,1,0,0


In [10]:


from sklearn.cluster import KMeans
kclusters = 5

# The clustering features are the borough indicator columns only.
# 'Cluster Labels' is dropped as well (when present) so that re-running this
# cell never feeds the labels of a previous run back in as a feature.
# The columns= keyword replaces the positional axis argument, which newer pandas
# releases no longer accept.
toronto_grouped_clustering = toronto_grouped.drop(
    columns=['Neighbourhood', 'Cluster Labels'], errors='ignore'
)

# run k-means clustering (random_state fixed for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# insert cluster labels as the first column, replacing labels left by an
# earlier run so that insert() does not raise on a duplicate column
if 'Cluster Labels' in toronto_grouped.columns:
    toronto_grouped = toronto_grouped.drop(columns='Cluster Labels')
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

# add the cluster label and the borough indicators to every row of result,
# matching on the neighbourhood name
toronto_merged = result.join(toronto_grouped.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head()



Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,2,0,0,0,0,0,0,0,1,0,0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,2,0,0,0,0,0,0,0,1,0,0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,2,0,0,0,0,0,0,0,1,0,0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,2,0,0,0,0,0,0,0,1,0,0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,2,0,0,0,0,0,0,0,1,0,0


In [11]:
#set latitude and longitude of Toronto
latitude = 43.6532
longitude = -79.3832

toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, stored as hex strings so that rainbow[k] is the colour of cluster k
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per row of toronto_merged
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels start at 0, so the label indexes the palette directly;
    # indexing with cluster-1 sent label 0 to index -1, i.e. the last colour
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(toronto_map)

toronto_map