# Segmenting and Clustering Neighborhoods in Toronto: With Analysis

### 1. Initialize the Notebook

Download the dependencies required for the notebook.

In [1]:
# Import pandas 
import pandas as pd

# Import numpy
import numpy as np

# Convert an address into latitude and longitude values
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

# Matplotlib and plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Import the kmeans for the clustering stage
from sklearn.cluster import KMeans

# Map rendering library
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries have been imported')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    openssl-1.1.1f             |       h516909a_0         2.1 MB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

### 2. Read the data and transform it into a dataframe

Read the table of postal codes from the Wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M and load it into a dataframe.

In [3]:
# Page that holds the table of postal codes
POSTAL_CODES_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html collects the tables parsed from the page into a list
read_postal = pd.read_html(POSTAL_CODES_URL)

# The first entry of that list is used as the postal code dataframe
df_postal = read_postal[0]
df_postal.head(5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### 3. Clean the data

Rename the columns to PostalCode, Borough and Neighborhood.  Remove the rows whose Borough is 'Not assigned', and where the Neighborhood is null replace it with the value of the Borough column.  Additionally, the data needs to be grouped by postal code, with the neighborhoods that share a postal code listed in the Neighborhood column separated by commas.

In [4]:
# Set the column names
df_postal.columns = ["PostalCode", "Borough", "Neighborhood"]

# Clean the data in a single chain in which every step returns a new object:
#   1. drop the rows whose Borough is 'Not assigned' and renumber the index
#   2. replace null Neighborhood values with the Borough value of the same row
#   3. the Neighborhood column already comes grouped by postal code but is
#      separated by ' / ', so that separator is replaced with ', '
# The fill is done through assign() rather than an in-place fillna on a column
# selection: that form is chained assignment, which pandas deprecates and which
# leaves the dataframe unchanged when copy-on-write is enabled.
df_postal = (
    df_postal
    .loc[lambda frame: frame["Borough"] != "Not assigned"]
    .reset_index(drop=True)
    .assign(
        Neighborhood=lambda frame: (
            frame["Neighborhood"]
            .fillna(frame["Borough"])
            # ' / ' is a literal separator, so regex matching is switched off
            # explicitly instead of relying on the version-dependent default
            .str.replace(" / ", ", ", regex=False)
        )
    )
)

# Display the first 12 results of the cleaned up data
df_postal.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### 4. Show the size of the dataframe

In [5]:
# .shape is an attribute (not a method) holding the (rows, columns) tuple of the dataframe
df_postal.shape

(103, 3)

### 5. Load the geospatial data

In [6]:
# Location of the csv file with the geospatial data
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'

# Read the csv file straight from the URL and preview the first five rows
df_geo = pd.read_csv(GEOSPATIAL_CSV_URL)
df_geo.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 6. Merge the geospatial data with the postal code data

In [7]:
# Join the geospatial data onto the postal code data. The key column is named
# "PostalCode" on the left and "Postal Code" on the right, so both end up in
# the merged result; the right-hand copy is dropped again.
df_merge = (
    df_postal
    .merge(df_geo, left_on="PostalCode", right_on="Postal Code")
    .drop(columns=["Postal Code"])
)
df_merge.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


Select just the Toronto data from the merged dataframe.

In [8]:
# Flag the rows whose Borough contains "toronto", ignoring upper/lower case
is_toronto = df_merge["Borough"].str.contains("toronto", case=False)

# Keep only the flagged rows (the index values of df_merge are retained)
df_toronto = df_merge[is_toronto]
df_toronto.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [9]:
# Number of distinct values in the Borough column
borough_count = df_toronto['Borough'].nunique(dropna=False)

# Number of rows in the dataframe; this is the figure reported as "neighborhoods"
row_count = len(df_toronto)

summary = 'The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count)
print(summary)

The dataframe has 4 boroughs and 39 neighborhoods.


### 7. Create the map of Toronto

Find the coordinates of Toronto using Nominatim

In [10]:
# Get the coordinates for Toronto
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match for the address;
# stop here with a clear message instead of failing on the attribute access below
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))

# latitude and longitude are reused when the map is created
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


Create the map of Toronto, zoom in on it and display all of the necessary labels.

In [35]:
# Create the map of Toronto based on the coordinates and zoom in on it
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Columns needed for each marker, in the order they are unpacked below
marker_columns = ['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood']
marker_rows = zip(*(df_toronto[column] for column in marker_columns))

# Add markers to map, display the label as "PostalCode - Borough: Neighborhood"
for lat, lng, postalcode, borough, neighborhood in marker_rows:
    label = '{} - {}: {}'.format(postalcode, borough, neighborhood)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html is an option of the Popup, so it is passed only there and
        # no longer repeated as a keyword of the CircleMarker itself
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Display the map and the points

# import the HTML display class from IPython's public display module
from IPython.display import HTML

# convert the map into HTML
HTML(map_toronto._repr_html_())