## Let's start by importing the required libraries

In [1]:
import numpy as np
import pandas as pd
import json
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

!pip install geopy 
from geopy.geocoders import Nominatim

!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                       

### Install BeautifulSoup, then retrieve the Wikipedia page and extract the required postal-code table from it.

In [2]:
!pip install BeautifulSoup4
from bs4 import BeautifulSoup

# Download the Wikipedia page that lists the "M" postal codes.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(WIKI_URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
website_url = response.text
soup = BeautifulSoup(website_url, 'html.parser')

# Locate the table to scrape; soup.find returns None when nothing matches.
my_table = soup.find('table', {'class': 'wikitable sortable'})
if my_table is None:
    raise ValueError(
        "No table with class 'wikitable sortable' was found at " + WIKI_URL
        + "; the page layout may have changed."
    )
# print(my_table)

# Column accumulators: A = first cell, B = second cell, C = third cell of each row.
A = []
B = []
C = []

for row in my_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells are kept (header rows use <th>).
    if len(cells) == 3:
        # The first text node of each cell is stored raw (untrimmed);
        # whitespace is cleaned up in a later cell.
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))



#### Conversion of the table into a DataFrame

In [3]:
# Assemble the three scraped lists into one frame, one named column per list.
df = pd.DataFrame(
    data={
        'Postal_Code': A,
        'Borough': B,
        'Neighbourhood': C,
    }
)
# Preview the first five rows.
df.head()

Unnamed: 0,Postal_Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Getting rid of rows with Borough as "Not assigned"

In [4]:
# Drop rows whose borough is unassigned. The comparison is done on the
# stripped text so it no longer depends on the scraped cell ending in exactly
# one trailing "\n".
unassigned_borough = df["Borough"].str.strip() == "Not assigned"
df.drop(df.loc[unassigned_borough].index, inplace=True)
# Remove every run of whitespace (and any literal backslash-n sequence) from
# all string cells.
# NOTE: this also deletes the spaces inside names ("North York" becomes
# "NorthYork"); the 'Notassigned' comparison in the next cell relies on it.
df.replace(r'\s+|\\n', "", regex=True, inplace=True)
df.head(10)

Unnamed: 0,Postal_Code,Borough,Neighbourhood
2,M3A,NorthYork,Parkwoods
3,M4A,NorthYork,VictoriaVillage
4,M5A,DowntownToronto,"RegentPark,Harbourfront"
5,M6A,NorthYork,"LawrenceManor,LawrenceHeights"
6,M7A,DowntownToronto,"Queen'sPark,OntarioProvincialGovernment"
8,M9A,Etobicoke,"IslingtonAvenue,HumberValleyVillage"
9,M1B,Scarborough,"Malvern,Rouge"
11,M3B,NorthYork,DonMills
12,M4B,EastYork,"ParkviewHill,WoodbineGardens"
13,M5B,DowntownToronto,"GardenDistrict,Ryerson"


##### If a row has a borough but its neighbourhood is "Not assigned", the neighbourhood is set to the same value as the borough

In [5]:
# Rows whose neighbourhood is the placeholder value 'Notassigned' take
# their borough as the neighbourhood (values are aligned on the row index).
placeholder_rows = df['Neighbourhood'].eq('Notassigned')
df.loc[placeholder_rows, 'Neighbourhood'] = df['Borough']

In [6]:
# Spot-check: pull the rows for a hand-picked list of postal codes, in the
# order the codes are listed.
column_names = ["Postal_Code", "Borough", "Neighbourhood"]

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

# Collect the matching slices first and concatenate once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it each time.
matching_rows = [df[df["Postal_Code"] == postcode] for postcode in test_list]
test_df = pd.concat(matching_rows, ignore_index=True)[column_names]

test_df

Unnamed: 0,Postal_Code,Borough,Neighbourhood
0,M5G,DowntownToronto,CentralBayStreet
1,M2H,NorthYork,HillcrestVillage
2,M4B,EastYork,"ParkviewHill,WoodbineGardens"
3,M1J,Scarborough,ScarboroughVillage
4,M4G,EastYork,Leaside
5,M4M,EastToronto,StudioDistrict
6,M1R,Scarborough,"Wexford,Maryvale"
7,M9V,Etobicoke,"SouthSteeles,Silverstone,Humbergate,Jamestown,..."
8,M9L,NorthYork,HumberSummit
9,M5V,DowntownToronto,"CNTower,KingandSpadina,RailwayLands,Harbourfro..."


##### Applying Groupby

In [8]:
# Collapse rows that share a postal code and borough into a single row,
# joining their neighbourhood strings with ', '. sort=False leaves the groups
# in order of first appearance rather than sorting by key.
group_keys = ['Postal_Code', 'Borough']
result = df.groupby(group_keys, sort=False).agg(', '.join)
# Move the group keys from the index back into ordinary columns.
df_final = result.reset_index()
df_final.head(10)

Unnamed: 0,Postal_Code,Borough,Neighbourhood
0,M3A,NorthYork,Parkwoods
1,M4A,NorthYork,VictoriaVillage
2,M5A,DowntownToronto,"RegentPark,Harbourfront"
3,M6A,NorthYork,"LawrenceManor,LawrenceHeights"
4,M7A,DowntownToronto,"Queen'sPark,OntarioProvincialGovernment"
5,M9A,Etobicoke,"IslingtonAvenue,HumberValleyVillage"
6,M1B,Scarborough,"Malvern,Rouge"
7,M3B,NorthYork,DonMills
8,M4B,EastYork,"ParkviewHill,WoodbineGardens"
9,M5B,DowntownToronto,"GardenDistrict,Ryerson"


## Downloading Geospatial_data of Toronto

In [11]:
# Read the postal-code coordinate table from the remote CSV file.
GEO_CSV_URL = 'http://cocl.us/Geospatial_data'
locdf = pd.read_csv(GEO_CSV_URL)
# Relabel the three columns so the key column is named 'Postal_Code',
# matching the column name used in the postal-code frames above.
locdf.columns = ['Postal_Code', 'Latitude', 'Longitude']
locdf.head()

Unnamed: 0,Postal_Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Adding Lat and Long columns

In [12]:
# Attach latitude/longitude to each postal code. merge defaults to an inner
# join, so only codes present in both frames are kept.
coordinates = locdf[['Postal_Code', 'Latitude', 'Longitude']]
new_final = df_final.merge(coordinates, on='Postal_Code')
new_final

Unnamed: 0,Postal_Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,NorthYork,Parkwoods,43.753259,-79.329656
1,M4A,NorthYork,VictoriaVillage,43.725882,-79.315572
2,M5A,DowntownToronto,"RegentPark,Harbourfront",43.654260,-79.360636
3,M6A,NorthYork,"LawrenceManor,LawrenceHeights",43.718518,-79.464763
4,M7A,DowntownToronto,"Queen'sPark,OntarioProvincialGovernment",43.662301,-79.389494
5,M9A,Etobicoke,"IslingtonAvenue,HumberValleyVillage",43.667856,-79.532242
6,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
7,M3B,NorthYork,DonMills,43.745906,-79.352188
8,M4B,EastYork,"ParkviewHill,WoodbineGardens",43.706397,-79.309937
9,M5B,DowntownToronto,"GardenDistrict,Ryerson",43.657162,-79.378937


## Creating clusters on the map

In [13]:
# Base map centred on the coordinates used for Toronto in this notebook.
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=10)

# Feature matrix for clustering: one (latitude, longitude) row per postal code.
X = new_final['Latitude']
Y = new_final['Longitude']
Z = np.stack((X, Y), axis=1)

# One fill colour per cluster. The list is deliberately NOT called `colors`:
# the import cell binds that name to the matplotlib.colors module, and
# rebinding it here would clobber the module for the rest of the session.
cluster_colors = ['red', 'green', 'blue', 'yellow']

# The number of clusters is taken from the palette so that every cluster
# label is guaranteed to have a colour. random_state fixes the initialisation
# so repeated runs give the same labels.
kmeans = KMeans(n_clusters=len(cluster_colors), random_state=0).fit(Z)

clusters = kmeans.labels_
# Store each row's cluster label next to its coordinates.
new_final['Cluster'] = clusters

# Draw one circle marker per postal code: the popup shows the borough and the
# fill colour shows the cluster.
for latitude, longitude, borough, cluster in zip(new_final['Latitude'], new_final['Longitude'], new_final['Borough'], new_final['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=cluster_colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)

# Last expression of the cell: renders the interactive map.
toronto_map