This is Dan Kim's notebook for the IBM Capstone Course on Coursera

In [40]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from sklearn.cluster import KMeans 
from geopy.geocoders import Nominatim
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
# Sanity check that the kernel runs; prints the course greeting.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


In [5]:
# Scrape the Toronto ("M") postal-code table from Wikipedia into a DataFrame
# with the columns PostalCode, Borough and Neighborhood.
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on network problems or HTTP error pages instead of parsing an
# error document and crashing later with an unrelated AttributeError.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data,"html5lib")

table_contents=[]
table=soup.find('table')
if table is None:
    raise ValueError("No <table> element found at {}".format(url))

for row in table.find_all('td'):
    # Each cell carries the postal code in its <p> tag and
    # "Borough(Neighborhood / Neighborhood ...)" in its <span> tag.
    span_text = row.span.text
    if span_text == 'Not assigned':
        continue

    parts = span_text.split('(')

    # Normalise the neighbourhood list: drop the closing parenthesis, turn the
    # " /" separators into commas, and trim surrounding spaces.
    neighborhood = parts[1].strip(')')
    neighborhood = neighborhood.replace(' /', ',')
    neighborhood = neighborhood.replace(')', ' ')
    neighborhood = neighborhood.strip(' ')

    table_contents.append({
        'PostalCode': row.p.text[:3],
        'Borough': parts[0],
        'Neighborhood': neighborhood,
    })

df=pd.DataFrame(table_contents)

# A few borough strings come out fused with extra descriptive text; map those
# exact strings to short, readable names.
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [6]:
# Load the postal-code coordinate lookup from a CSV file in the working
# directory (the file must sit next to the notebook for this cell to run).
geodata = pd.read_csv("Geospatial_Coordinates.csv")

In [13]:
# Give the key column the same name as the scraped table's "PostalCode"
# column so the two frames can be joined on it.
geodata = geodata.rename(columns={"Postal Code": "PostalCode"})
geodata.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# Attach Latitude/Longitude to every postal code. Because `df` is overwritten
# with the merged result, first drop any coordinate columns left over from a
# previous run of this cell; otherwise re-running it would produce
# Latitude_x / Latitude_y duplicates and break the cells below.
# Note: merge() defaults to an inner join, so postal codes without a match in
# `geodata` are dropped.
df = (
    df.drop(columns=geodata.columns.difference(["PostalCode"]), errors="ignore")
      .merge(geodata, on="PostalCode")
)

In [17]:
# Preview the merged frame to confirm the coordinate columns were attached.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


# Basic Idea For Clustering:
I cluster the neighborhoods by latitude and longitude using k-means, so each neighborhood is assigned to the cluster whose centroid is geographically closest to it.

# Analysis Approach:
I turned the clustering and visualization logic into a function and ran it with different numbers of clusters. 

## Result:
As the number of clusters goes up, the number of neighborhoods in each cluster goes down, but each cluster is spread out over a smaller area. This could help someone choose which specific part of Toronto they want to live in.

In [20]:
# Feature matrix for clustering: one row per postal code, coordinates only.
X = df.loc[:, ["Latitude", "Longitude"]]

In [26]:
# Look up the coordinates of Toronto; they are used to centre the maps.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ibm_explorer")
# Use an explicit timeout so a slow response from the public Nominatim
# service is less likely to abort the notebook.
location = geolocator.geocode(address, timeout=10)
# geocode() returns None when nothing matches; report that clearly instead of
# failing on the attribute access below.
if location is None:
    raise ValueError("Could not geocode address: {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [42]:
def mapTheClusters(n, random_state=0):
    """Cluster the postal-code areas with k-means and plot them on a map.

    Reads the notebook-level variables ``X`` (the Latitude/Longitude feature
    frame), ``df`` (for marker positions and popup text) and
    ``latitude``/``longitude`` (the map centre); those cells must have been
    run first.

    Parameters
    ----------
    n : int
        Number of clusters to fit.
    random_state : int or None, default 0
        Seed forwarded to ``KMeans`` so the same ``n`` always yields the same
        map. Pass ``None`` for unseeded, run-to-run varying results.

    Returns
    -------
    folium.Map
        Map centred on ``latitude``/``longitude`` with one circle marker per
        row of ``df``, coloured by cluster label.
    """
    num_clusters = n

    k_means = KMeans(
        init="k-means++",
        n_clusters=num_clusters,
        n_init=12,
        random_state=random_state,
    )
    k_means.fit(X)
    labels = k_means.labels_

    map_toronto = folium.Map(location=[latitude, longitude], zoom_start=9.5)

    # One evenly spaced colour from the rainbow colormap per cluster.
    colors_array = cm.rainbow(np.linspace(0, 1, num_clusters))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    # add markers to map
    for lat, lng, borough, neighborhood, cluster in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood'], labels):
        label = '{}, {}'.format(neighborhood, borough)
        label = folium.Popup(label, parse_html=True)
        # Labels run from 0 to n-1, so index the palette directly rather than
        # relying on negative-index wraparound for cluster 0.
        cluster_color = rainbow[cluster]
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color=cluster_color,
            fill=True,
            fill_color=cluster_color,
            fill_opacity=0.7,
            parse_html=False).add_to(map_toronto)

    return map_toronto

In [43]:
# Start of the cluster-count sweep: map with 3 clusters.
mapTheClusters(3)

In [44]:
# Map with 4 clusters.
mapTheClusters(4)

In [45]:
# Map with 5 clusters.
mapTheClusters(5)

In [46]:
# Map with 6 clusters.
mapTheClusters(6)

In [47]:
# Map with 7 clusters.
mapTheClusters(7)

In [48]:
# Map with 8 clusters.
mapTheClusters(8)