## Segmenting and Clustering Neighborhoods in Toronto

Carlos Eduardo Bittencourt

In [17]:
# Prerequisite: install beautifulsoup4 (e.g. `%pip install beautifulsoup4`); it is used below to parse the Wikipedia HTML so the table can be loaded into a data frame

In [2]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
wikiurl = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(wikiurl, timeout=30)
# Fail fast on an HTTP error (4xx/5xx) instead of parsing an error page further down.
response.raise_for_status()
print(response.status_code)

200


In [4]:
# Build a parse tree from the downloaded markup, then pick out the first
# <table> element carrying the "wikitable" CSS class.
soup = BeautifulSoup(response.text, 'html.parser')
can_postal_table = soup.find('table', class_='wikitable')

In [5]:
# read the HTML table into a list of DataFrame objects using read_html(), then keep the first one.

In [6]:
# pd.read_html returns a list of DataFrames, one per <table> found in the markup;
# only a single table is passed in, so the first (index 0) entry is the one we want.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(can_postal_table)))[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Count the distinct rows whose borough is still 'Not assigned'.
unassigned_mask = df['Borough'].eq('Not assigned')
df.loc[unassigned_mask].value_counts()

Postal Code  Borough       Neighbourhood
M9Z          Not assigned  Not assigned     1
M5Y          Not assigned  Not assigned     1
M3T          Not assigned  Not assigned     1
M3V          Not assigned  Not assigned     1
M3W          Not assigned  Not assigned     1
                                           ..
M7Z          Not assigned  Not assigned     1
M8A          Not assigned  Not assigned     1
M8B          Not assigned  Not assigned     1
M8C          Not assigned  Not assigned     1
M1A          Not assigned  Not assigned     1
Length: 77, dtype: int64

In [8]:
# Drop every postal code whose borough is 'Not assigned' and renumber the
# remaining rows from 0 so the index has no gaps.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
# Shape of the cleaned dataframe as (rows, columns); after removing the
# 'Not assigned' boroughs it has 103 rows.
df.shape

(103, 3)

## PART 2

In [10]:
# Load the CSV file with the coordinates (latitude/longitude per postal code) into a dataframe.
coord = pd.read_csv('http://cocl.us/Geospatial_data')

In [11]:
# Preview the first rows of the coordinates table.
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Join the neighbourhood table with the coordinates table on their shared
# 'Postal Code' column (inner join), giving one frame with all the information.
df_coord = df.merge(coord, on='Postal Code', how='inner')

In [16]:
# Preview the merged table: borough and neighbourhood alongside latitude/longitude.
df_coord.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [18]:
from geopy.geocoders import Nominatim
import folium 

In [28]:
# Create a map of Toronto and its neighbourhoods, then add one marker per postal-code area.
# Toronto lies west of the prime meridian, so the longitude of the map centre must be
# negative; a positive value would centre the map on the wrong side of the globe.
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

for lat, lng, borough, neighbourhood in zip(
    df_coord['Latitude'],
    df_coord['Longitude'],
    df_coord['Borough'],
    df_coord['Neighbourhood'],
):
    # Popup text shown when the marker is clicked: "<neighbourhood(s)>, <borough>".
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    # parse_html is a Popup option, so it is only passed to the Popup above and
    # not repeated on the CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell: renders the interactive map inline.
map_toronto