### 1. Import the necessary packages

In [57]:
from bs4 import BeautifulSoup
import requests
from itertools import zip_longest
import pandas as pd
import folium
from sklearn.cluster import KMeans
import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

### 2. Scrape the Wikipedia page to load the table

In [4]:
# Make a GET request to fetch the raw HTML content.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# fails loudly on an HTTP error instead of parsing an error page as data.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
html_content = response.text

# Parse the html content
soup = BeautifulSoup(html_content, features="html.parser")

# Locate the first table with class "wikitable" and collect its rows once
postal_table = soup.find("table", attrs={"class": "wikitable"})
if postal_table is None:
    raise ValueError("No table with class 'wikitable' found on the page")
postal_table_data = postal_table.tbody.find_all("tr")

# Get the headings in the table: the first row holds the <th> cells
headings = [
    th.text.replace('\n', '').strip()
    for th in postal_table_data[0].find_all("th")
]

# Get the rest of the table. Rows without any <td> cell (the header row) are
# skipped explicitly instead of being appended empty and sliced off afterwards.
data = []
for tr in postal_table_data:
    table_data = [td.text.replace('\n', '').strip() for td in tr.find_all("td")]
    if table_data:
        data.append(table_data)

# Convert the rows into a DataFrame, using the scraped headings as column names
df = pd.DataFrame(data, columns=headings)
df = df.rename(columns={"Postal code": "Postal Code"})
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### 3. Clean the dataset

In [5]:
# Ignore rows with "Not assigned" boroughs.
# A vectorised boolean mask on the named column replaces the row-by-row loop,
# which read the borough positionally (row[1]) and therefore depended on the
# column order; positional integer lookup on a labelled Series is also
# deprecated in recent pandas. Rebinding df (instead of dropping in place)
# keeps the cell safe to re-run.
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### 4. Shape of the dataset

In [6]:
# (rows, columns) of the cleaned postal-code table
df.shape

(103, 3)

### 5. Add the coordinate columns to the data

In [7]:
# Load the per-postal-code coordinates from a CSV file (path is relative to the
# notebook's working directory) and preview the first rows
df_coor=pd.read_csv("Geospatial_Coordinates.csv")
df_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 6. Merge the two datasets

In [8]:
# Attach Borough/Neighborhood to every coordinate row by matching on the postal
# code (default inner join: only codes present in both frames are kept).
df_new = df_coor.merge(df, on='Postal Code')
df_new

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,Malvern / Rouge
1,M1C,43.784535,-79.160497,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,43.763573,-79.188711,Scarborough,Guildwood / Morningside / West Hill
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae
...,...,...,...,...,...
98,M9N,43.706876,-79.518188,York,Weston
99,M9P,43.696319,-79.532242,Etobicoke,Westmount
100,M9R,43.688905,-79.554724,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...
101,M9V,43.739416,-79.588437,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...


### 7. Neighborhood Segmentation

I have chosen the 'North York' borough for clustering, so I subset the dataset to keep only the rows that belong to North York.

In [16]:
# Restrict the merged frame to the rows of the North York borough
df_york = df_new[df_new['Borough'].eq('North York')]
df_york.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
17,M2H,43.803762,-79.363452,North York,Hillcrest Village
18,M2J,43.778517,-79.346556,North York,Fairview / Henry Farm / Oriole
19,M2K,43.786947,-79.385975,North York,Bayview Village
20,M2L,43.75749,-79.374714,North York,York Mills / Silver Hills
21,M2M,43.789053,-79.408493,North York,Willowdale / Newtonbrook


I took the coordinates of Toronto from Google instead of using the geocoder as we did in the lab, because it is easier.

In [17]:
# Coordinates of Toronto (looked up manually); used as the centre of the maps
latitude, longitude = 43.6529, -79.3849
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6529, -79.3849.


Now, let's create a map of Toronto using our coordinate values and put the neighborhood markers on the map.
##### The map displays the city of Toronto, but the markers only point to the neighborhoods of North York.

In [117]:
# Base map centred on the Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per North York row, with a "<neighborhood>, <borough>" popup
for lat, lng, borough, neighborhood in df_york[
        ['Latitude', 'Longitude', 'Borough', 'Neighborhood']].itertuples(index=False, name=None):
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option; it is kept here
        # only to preserve the original call - confirm CircleMarker ignores it.
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

### Clustering the neighborhoods using K-means

In [113]:
# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the two feature columns by name
# (rather than dropping the others with a positional axis argument, which
# pandas 2.x rejects) also keeps a previously inserted 'Cluster Labels' column
# out of the features when the cells are re-run.
df_york_clustering = df_york[['Latitude', 'Longitude']]

# run k-means clustering; n_init is pinned explicitly because its default
# differs between scikit-learn versions, which would change the labels
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df_york_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 3, 2, 2, 2, 3, 1, 3, 1, 1, 4, 2, 3, 3, 3, 1,
       4, 4], dtype=int32)

In [115]:
# Put the cluster label of each row in the first column.
# A label column left over from an earlier run is dropped first, so the cell
# can be re-executed without insert() raising on the duplicate column name.
# drop() returns a new frame, so df_new is never modified through df_york.
df_york = df_york.drop(columns='Cluster Labels', errors='ignore')
df_york.insert(0, 'Cluster Labels', kmeans.labels_)
df_york.head()

Unnamed: 0,Cluster Labels,Postal Code,Latitude,Longitude,Borough,Neighborhood
17,0,M2H,43.803762,-79.363452,North York,Hillcrest Village
18,0,M2J,43.778517,-79.346556,North York,Fairview / Henry Farm / Oriole
19,0,M2K,43.786947,-79.385975,North York,Bayview Village
20,0,M2L,43.75749,-79.374714,North York,York Mills / Silver Hills
21,0,M2M,43.789053,-79.408493,North York,Willowdale / Newtonbrook


### The map might not be visible on GitHub

In [118]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_york['Latitude'], df_york['Longitude'], df_york['Neighborhood'], df_york['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index the palette with the label itself (labels run 0..kclusters-1).
    # The previous `cluster-1` only worked because index -1 wraps around to
    # the last color, silently shifting every cluster's color by one.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters