<h1 style="text-align: center;">Segmenting and Clustering Neighborhoods in Toronto</h1>

### First, import the required modules

In [1]:
import pandas as pd
import html5lib
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
import folium
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

### Now, load the data

In [2]:
df_postal = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df_postal.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Clean the data

In [3]:
df_postal.columns = ['PostalCode', 'Borough', 'Neighborhood']
df_postal = df_postal[df_postal['Borough'] != "Not assigned"]
df_postal.reset_index(drop=True, inplace=True)
# Sorting the dataframe by the postal code to match the new dataframe
df_postal.sort_values('PostalCode', inplace=True)
df_postal.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


### Check if there is a cell is not assigned for Neighborhood

In [4]:
df_postal[df_postal['Neighborhood'] == "Not assigned"]

Unnamed: 0,PostalCode,Borough,Neighborhood


### The shape of the dataframe:

In [5]:
df_postal.shape

(103, 3)

### Download the geospatial data

In [6]:
!wget -q -O 'Geospatial_Coordinates.csv' https://cocl.us/Geospatial_data
print('Data downloaded')

Data downloaded


### Load the data

In [7]:
with open('Geospatial_Coordinates.csv') as csv_data:
    geo_data = pd.read_csv(csv_data)

### Create the new dataframe

In [8]:
df_new = pd.DataFrame(columns=['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'])
df_new['PostalCode'] = df_postal['PostalCode'] = geo_data['Postal Code']
df_new['Borough'] = df_postal['Borough']
df_new['Neighborhood'] = df_postal['Neighborhood']
df_new['Latitude'] = geo_data['Latitude']
df_new['Longitude'] = geo_data['Longitude']

### Display the first 5 rows

In [9]:
df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,North York,Parkwoods,43.806686,-79.194353
1,M1C,North York,Victoria Village,43.784535,-79.160497
2,M1E,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M1G,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M1H,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476


### Let's create a map for Toronto!

#### Get the coordinate

In [10]:
address = 'City of Toronto, ON'
geo_locater = Nominatim(user_agent="tr_explorer")
location = geo_locater.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(f'Toronto location is [{latitude}, {longitude}]')

Toronto location is [43.7170226, -79.41978303501344]


#### Let's define the map

In [11]:
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, borough, neighborhood in zip(df_new['Latitude'], df_new['Longitude'], df_new['Borough'], df_new['Neighborhood']):
    label = f'{neighborhood}, {borough}'
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color="blue",
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)
toronto_map        