# Segmenting and Clustering Neighborhoods in Toronto

# Part 1: Scraping the Wikipedia page with <i>Pandas</i> to obtain the table of postal codes

### Installing lxml library for html scraping

In [1]:
!pip install lxml



### Importing Pandas library

In [2]:
import pandas as pd

### Reading html table into a dataframe given the url

In [3]:
# Parse the tables found on the Wikipedia page; the result is indexed by position below
df_html = pd.read_html(
    io='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)

### Based on html page content, we get the first table of the list

In [4]:
# The postal-code table is the first table parsed from the page
df = df_html[0]

# Preview the first five rows
df.head(5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [5]:
# Give the postal-code column a space-free name: 'Postal code' -> 'PostalCode'
df.rename({'Postal code': 'PostalCode'}, axis='columns', inplace=True)
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [6]:
# Record the dataframe's (rows, columns) before any cleaning, as a baseline
# to compare against once the unassigned rows have been removed
df.shape

(180, 3)

### Only process the cells that have an assigned borough; ignore cells whose borough is <i>Not assigned</i>.

In [7]:
# Collect the index labels of the rows whose borough is the placeholder 'Not assigned'
indexNames = df.loc[df['Borough'].eq('Not assigned')].index

# Remove those rows from df in place, addressing them by index label
df.drop(index=indexNames, inplace=True)
df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### List the neighborhoods that share a Postal Code separated by commas (instead of slashes)

In [8]:
# Neighborhoods sharing a postal code are written as "A / B" in the source table;
# turn each " / " separator into ", ".
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Neighborhood']: that chained in-place form
# operates on an intermediate Series, emits a warning on recent pandas and
# leaves df unchanged when Copy-on-Write is enabled.
df['Neighborhood'] = df['Neighborhood'].replace(r' \/ ', ', ', regex=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Reset Dataframe index:

In [9]:
# Dropping rows left gaps in the row labels; relabel the rows 0..n-1 in place
df.index = pd.RangeIndex(len(df))
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Using the `.shape` attribute to get the number of rows and columns of the dataframe

In [10]:
# Shape after cleaning: the rows with an unassigned borough are gone,
# while the number of columns is unchanged
df.shape

(103, 3)

# Part 2: Getting the <i>latitude</i> and the <i>longitude</i> coordinates of a given Postal Code

### Load latitude & longitude by postal code dataframe from csv

In [11]:
# Fetch the lookup table of latitude/longitude per postal code, published as a CSV
df_geo = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
df_geo.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging both dataframes by the Postal Code common column

In [12]:
# Left-join the coordinates onto the cleaned table. The key column of the lookup
# frame is renamed first so both sides join on 'PostalCode', which leaves no
# separate 'Postal Code' column to drop afterwards.
df_toronto = df.merge(
    df_geo.rename(columns={'Postal Code': 'PostalCode'}),
    on='PostalCode',
    how='left',
)
df_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# Part 3: Explore and cluster the neighborhoods in Toronto

### Let's get the geographical coordinates of Toronto.

In [13]:
# Geocoding client: converts an address into latitude and longitude values
from geopy.geocoders import Nominatim

# Map rendering library
import folium

# Place whose coordinates are used to centre the map
address = 'Toronto'

# NOTE(review): the user agent 'ny_explorer' looks carried over from another
# notebook; consider an identifier specific to this project.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; stop here with a clear
# message instead of failing with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


### Displaying the Map of Toronto

In [14]:
# Create a map of Toronto centred on the geocoded latitude and longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# The coordinates come from a left merge, so a postal code missing from the lookup
# table has NaN latitude/longitude. folium rejects NaN locations, so only rows
# with both coordinates present are plotted.
located = df_toronto.dropna(subset=['Latitude', 'Longitude'])

# Add one circle marker per postal code, with its neighborhood name(s) as the popup
for lat, lng, label in zip(located['Latitude'], located['Longitude'], located['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is not passed here
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto