## Segmenting and Clustering Neighborhoods in Toronto
#### Scrape the Wikipedia page to explore, segment, and cluster the neighborhoods

In [10]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
from IPython.display import Image 
from IPython.core.display import HTML 
from bs4 import BeautifulSoup
from urllib.request import urlopen

### 1. Scraping data from a website

In [278]:
#using panda to read the HTML data from Wikipedia
table=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

### 2. Creating a table

In [279]:
#selecting the first table from the HTML file 
data= pd.DataFrame(table[0])
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### 3. Data cleaning and structuring

In [280]:
#filtering the Borough feature and dropping each row that contains 'Not assigned'
indexname=data[data['Borough']== 'Not assigned'].index
data.drop(indexname, inplace=True)
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [281]:
type(data)

pandas.core.frame.DataFrame

In [282]:
#grouping the Neighbourhood by postal code feature
data.groupby(['Postal Code'],as_index=False)
data.reset_index(drop=True, inplace=True)
data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [283]:
#checks if given value exist in the dataframe
result = data.isin(['Not assigned']).any().any()
if result:
    print('Element exists in Dataframe')
else:
    print('none')

none


In [284]:
data.shape

(103, 3)

### 4. Data Exploration

In [323]:
# transform list-likes cells in Neighbourhood column to individual rows
data1=data.assign(Neighbourhood=data.Neighbourhood.str.split(',')).explode('Neighbourhood')
data1['Neighbourhood'] #check number of Neighbourhoods

0                      Parkwoods
1               Victoria Village
2                    Regent Park
2                   Harbourfront
3                 Lawrence Manor
                 ...            
102                    Mimico NW
102           The Queensway West
102               South of Bloor
102     Kingsway Park South West
102        Royal York South West
Name: Neighbourhood, Length: 217, dtype: object

In [324]:
# check how many neighborhoods each Borough has 
data1.groupby('Borough').count()

Unnamed: 0_level_0,Postal Code,Neighbourhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Central Toronto,18,18
Downtown Toronto,39,39
East Toronto,8,8
East York,7,7
Etobicoke,47,47
Mississauga,1,1
North York,38,38
Scarborough,38,38
West Toronto,13,13
York,8,8


### 5. Cluster Neighborhoods

In [343]:
from geopy.geocoders import Nominatim 
from sklearn.cluster import KMeans
import folium

In [350]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent = 'Toronto_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Toronto are 43.6534817, -79.3839347.


In [368]:
pip install geopandas

Note: you may need to restart the kernel to use updated packages.


In [365]:
conda install -c conda-forge geopy

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [392]:
data1['address']=data1['Neighbourhood']+','+ 'Toronto, Canada'

In [393]:
from geopy.extra.rate_limiter import RateLimiter
# create locator that holds the Geocoding service, Nominatim
locator = Nominatim(user_agent='Toronto_explorer')
# conveneint function to delay between geocoding calls: RateLimiter
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# create location column
data1['location']=data1['address'].apply(geocode)
# create longitude, latitude and altitude from location column (returns tuple)
data1['point']=data1['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# split point column into latitude, longitude and altitude columns
data1[['Latitude', 'Longitude', 'altitude']]= pd.DataFrame(data1['point'].tolist(), index=data1.index)

In [394]:
data1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,longitude,location,point,Longtitude,altitude,address
0,M3A,North York,Parkwoods,43.7588,,"(Parkwoods Village Drive, Parkway East, Don Va...","(43.7587999, -79.3201966, 0.0)",-79.320197,0.0,"Parkwoods,Toronto, Canada"
1,M4A,North York,Victoria Village,43.732658,,"(Victoria Village, Don Valley East, North York...","(43.732658, -79.3111892, 0.0)",-79.311189,0.0,"Victoria Village,Toronto, Canada"
2,M5A,Downtown Toronto,Regent Park,43.660706,,"(Regent Park, Toronto Centre, Old Toronto, Tor...","(43.6607056, -79.3604569, 0.0)",-79.360457,0.0,"Regent Park,Toronto, Canada"
2,M5A,Downtown Toronto,Harbourfront,43.64008,,"(Harbourfront, Spadina—Fort York, Old Toronto,...","(43.6400801, -79.3801495, 0.0)",-79.38015,0.0,"Harbourfront,Toronto, Canada"
3,M6A,North York,Lawrence Manor,43.722079,,"(Lawrence Manor, Eglinton—Lawrence, North York...","(43.7220788, -79.4375067, 0.0)",-79.437507,0.0,"Lawrence Manor,Toronto, Canada"


In [396]:
data1=data1.drop(['location', 'longitude', 'point', 'altitude'], axis=1)
data1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longtitude,address
0,M3A,North York,Parkwoods,43.7588,-79.320197,"Parkwoods,Toronto, Canada"
1,M4A,North York,Victoria Village,43.732658,-79.311189,"Victoria Village,Toronto, Canada"
2,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,"Regent Park,Toronto, Canada"
2,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015,"Harbourfront,Toronto, Canada"
3,M6A,North York,Lawrence Manor,43.722079,-79.437507,"Lawrence Manor,Toronto, Canada"


In [397]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, Neighbourhood in zip(data1['Latitude'], data1['Longtitude'], data1['Borough'], data1['Neighbourhood']):
    label = '{}, {}'.format(Neighbourhood, Borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

KeyError: 'Longitude'

In [330]:
# set number of clusters
kclusters = 10
data1_clustering = data1.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(data1_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

ValueError: could not convert string to float: 'Etobicoke'