# Assignment for segmenting neighborhoods of Toronto, CA

In [1]:
#libraries that might need to be installed
!pip3 install geopy
!pip3 install folium
!pip3 install numpy==1.16.2

Collecting geopy
  Using cached https://files.pythonhosted.org/packages/80/93/d384479da0ead712bdaf697a8399c13a9a89bd856ada5a27d462fb45e47b/geopy-1.20.0-py2.py3-none-any.whl
Collecting geographiclib<2,>=1.49 (from geopy)
  Using cached https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.20.0
Collecting folium
  Using cached https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl
Collecting branca>=0.3.0 (from folium)
  Using cached https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Collecting jinja2>=2.9 (from folium)
  Using cached https://files.pythonhosted.org/packages/7b/af/b9ed1959cb4bb7332e2b0797476c878fa38d200bfcfe38c6d53

In [2]:
#import standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#import html/scraping libraries
import requests
from bs4 import BeautifulSoup
import json
from pandas.io.json import json_normalize

#import mapping libraries
import geopy
import folium

#import clustering libraries
from sklearn.cluster import KMeans, DBSCAN

## In this section, we will scrape and clean the data to produce the correct dataframe

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#fetch the page: the timeout keeps the cell from hanging indefinitely, and
#raise_for_status() fails loudly on an HTTP error instead of parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()

t_wiki = BeautifulSoup(response.text,'lxml') #get soup

In [4]:
cols = ['Postcode',"Borough",'Neighbourhood']#create columns
pID = [] #list of postal ids
bID = [] #list of borough names
nID = [] #list of neighbourhoods
#the next line finds the table, then separates the rows
#(assumes the first <tbody> on the page is the postal code table and that its
# first row is the header, which [1:] skips -- re-check if the page layout changes)
locs = t_wiki.find_all('tbody')[0].find_all('tr')[1:]

#loop over each row and extract the information
for n in locs:
    d = n.find_all('td')
    if len(d) < 3:
        continue #not a complete data row; indexing d[0..2] would raise IndexError
    #strip every cell, not just the neighbourhood, so stray whitespace or newlines
    #cannot break later exact comparisons such as Borough != 'Not assigned'
    pID.append(d[0].text.strip())
    bID.append(d[1].text.strip())
    nID.append(d[2].text.strip())

In [5]:
#assemble the three scraped lists into one table in a single constructor call;
#columns=cols fixes the column order
t_df = pd.DataFrame(
    {'Postcode': pID, 'Borough': bID, 'Neighbourhood': nID},
    columns=cols,
)
    

In [6]:
#(rows, columns) of the scraped table before any cleaning
t_df.shape

(287, 3)

In [7]:
#Clean data frame to specifications
#This code ignores area without a borough assignment.
#.copy() makes the filtered frame independent, so the column assignment below
#does not raise SettingWithCopyWarning
t_df = t_df[t_df['Borough'] != 'Not assigned'].copy()

#This code will assign missing neighborhood names with borough names:
#keep the neighbourhood where it is assigned, otherwise take the borough (vectorised)
t_df['Neighbourhood'] = t_df['Neighbourhood'].where(
    t_df['Neighbourhood'] != 'Not assigned', t_df['Borough'])

#Finally, this code will group neighbourhoods by postal code and join the distinct
#values of each column. dict.fromkeys de-duplicates while keeping first-seen order,
#so the joined string is the same on every run (iterating a set of strings is not
#guaranteed to give a stable order between runs)
t_df = (t_df.groupby('Postcode')
            .agg(lambda x: ', '.join(dict.fromkeys(x)))
            .reset_index())

In [70]:
#shape after cleaning: the groupby on Postcode leaves one row per postal code
print(t_df.shape)

(103, 3)


In [72]:
#preview the first rows of the cleaned table
t_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union"
2,M1E,Scarborough,"West Hill, Guildwood, Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Setting Latitude and Longitude coordinates

In [93]:
import geocoder

#try to geocode the first postal code through geocoder's Google provider
pcodes = t_df.Postcode
query = '{}, Toronto, Ontario'.format(pcodes[0])
g = geocoder.google(query)
g

<[REQUEST_DENIED] Google - Geocode [empty]>

In [95]:
#list the working directory to see which data files are available
!ls

ds-cap-nb1.ipynb  Geospatial_Coordinates.csv  README.md  Toronto.ipynb


In [97]:
#load the postal code -> latitude/longitude lookup table from the local CSV file
latlong_df = pd.read_csv('Geospatial_Coordinates.csv')
latlong_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [100]:
#Attach coordinates by matching on the postal code instead of copying the columns
#by row position, which is only correct if both frames happen to be sorted identically.
coords = latlong_df.rename(columns={'Postal Code': 'Postcode'})[['Postcode', 'Latitude', 'Longitude']]

#Dropping any existing coordinate columns first keeps the cell safe to re-run
#(otherwise a second merge would create Latitude_x / Latitude_y columns).
#how='left' keeps every row of t_df, in its current order.
t_df = (t_df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
            .merge(coords, on='Postcode', how='left'))
t_df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Highland Creek, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"West Hill, Guildwood, Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Exploration and Clustering

In this section, I will explore and cluster the neighborhoods of Toronto.
To start, I will explore all of Toronto, then narrow my focus as the analysis goes on.

In [None]:
#First, we will get the coordinates of the center of Toronto.

In [111]:
from geopy.geocoders import Nominatim

address = "Toronto, ON"

geolocator = Nominatim(user_agent="extra_lime")
#a more generous timeout than the library default makes a slow response less
#likely to abort the lookup
location = geolocator.geocode(address, timeout=10)
#geocode() returns None when the address cannot be resolved; stop with a clear
#message instead of an AttributeError on the next line
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Torono are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Torono are 43.653963, -79.387207.


In [106]:
#show only the rows whose borough is Downtown Toronto
t_df[t_df['Borough']=='Downtown Toronto']

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
51,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
52,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
53,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
54,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
55,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
56,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
58,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
59,M5J,Downtown Toronto,"Toronto Islands, Union Station, Harbourfront East",43.640816,-79.381752
