# Getting the dataframe

##### Installing what is needed

In [2]:
# Dependencies (install once per environment rather than on every run):
#!pip install beautifulsoup4
#!pip install lxml
#!conda install -c conda-forge geopy --yes
#!conda install -c conda-forge folium=0.5.0 --yes

# --- standard library ---
import random  # library for random number generation

# --- third party ---
import requests  # library to handle HTTP requests
import numpy as np  # library to handle data in a vectorized manner
import pandas as pd  # library for data analysis

# Transforming JSON into a pandas DataFrame.
# The top-level `pandas.json_normalize` is the supported location; the old
# `pandas.io.json` path is kept only as a fallback for older pandas releases.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

from geopy.geocoders import Nominatim  # module to convert an address into latitude and longitude values

# Helpers for displaying images / HTML (public IPython.display API,
# not the internal IPython.core.display module).
from IPython.display import HTML, Image, display_html

import folium  # plotting library
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Folium installed')
print('Libraries imported.')

Solving environment: / ^C
- Folium installed
Libraries imported.


### Part 1
##### Getting the information from the wiki

In [23]:
from io import StringIO  # local to this cell: only needed to feed read_html

WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')

# soup.table is the first <table> element in the document; without this guard
# a page with no table would hand the literal text 'None' to the parser.
if soup.table is None:
    raise ValueError('No <table> element found at ' + WIKI_URL)
table = str(soup.table)

# Passing literal HTML to read_html is deprecated in recent pandas releases;
# wrapping it in a file-like object works on both old and new versions.
lista = pd.read_html(StringIO(table))
df = lista[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [24]:
# Discard rows with no borough, then collapse every (postal code, borough)
# pair into one row whose remaining text columns are joined with ', '.
df = (
    df.loc[df['Borough'] != 'Not assigned']
    .groupby(['Postal Code', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# Where a neighbourhood is still 'Not assigned', fall back to the borough name.
df['Neighborhood'] = np.where(
    df['Neighborhood'].eq('Not assigned'),
    df['Borough'],
    df['Neighborhood'],
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [25]:
# Dimensions of the cleaned frame as (rows, columns).
df.shape

(103, 3)

### Part 2
##### Adding geographical info

In [26]:
# Load the postal-code coordinate table from a remote CSV file.
GEO_URL = 'https://cocl.us/Geospatial_data'
lat_lon = pd.read_csv(filepath_or_buffer=GEO_URL)
lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [28]:
# Attach coordinates by postal code. This is an inner join (the merge default),
# so only postal codes present in both frames are kept.
df = df.merge(lat_lon, how='inner', on='Postal Code')
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [29]:
# Keep only boroughs whose name contains the literal text 'Toronto'
# (plain substring match, not a regular expression).
in_toronto = df['Borough'].str.contains('Toronto', regex=False)
df = df.loc[in_toronto]
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [42]:
# Base map, then one circle marker per row of df with a clickable
# "<neighbourhood>,<borough>" popup.
TorontoMap = folium.Map(location=[43.65, -79.34], zoom_start=12)

for lat, lon, bor, neigh in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{},{}'.format(neigh, bor)
    # parse_html=True makes folium escape the label text, so names containing
    # apostrophes or other special characters render safely.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=popup,  # was misspelled "popub", so no popup was ever attached
        color='black',
        fill=True,
        fill_color='limegreen',
        fill_opacity=0.9,
    ).add_to(TorontoMap)

TorontoMap

In [46]:
k = 4  # number of clusters

# Cluster on the coordinates only. Selecting the two columns explicitly
# (instead of dropping the text columns) keeps the feature set stable even if
# this cell is re-run after a 'Cluster' column has been added to df.
TorontoClusters = df[['Latitude', 'Longitude']]

# n_init is pinned because its default differs between scikit-learn releases;
# together with random_state this keeps the labels reproducible.
kClust = KMeans(n_clusters=k, n_init=10, random_state=0).fit(TorontoClusters)
kClust.labels_

array([1, 1, 1, 1, 3, 1, 1, 2, 1, 2, 1, 2, 3, 1, 2, 3, 1, 3, 0, 0, 0, 0,
       2, 0, 1, 2, 0, 1, 2, 0, 1, 0, 1, 1, 1, 1, 1, 1, 3], dtype=int32)

In [47]:
# Put the cluster label in the first column.
# Dropping any 'Cluster' column left by an earlier run makes the cell safe to
# re-execute (DataFrame.insert raises ValueError if the column already exists),
# and drop() returns a new frame, so the insert does not touch a slice of the
# pre-filter data.
df = df.drop(columns='Cluster', errors='ignore')
df.insert(0, 'Cluster', kClust.labels_)
df.head()

Unnamed: 0,Cluster,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,1,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,3,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [49]:
# Map of the same points, with each marker outlined in its cluster's colour.
TorontoClust = folium.Map(location=[43.65, -79.34], zoom_start=12)

# One outline colour per cluster label. Named cluster_colors so the list does
# not shadow the `matplotlib.colors` module that is imported as `colors`.
cluster_colors = ['limegreen', 'blue', 'red', 'orange']

for lat, lon, clust in zip(df['Latitude'], df['Longitude'], df['Cluster']):
    popup = folium.Popup('Cluster ' + str(clust), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=popup,  # was misspelled "popub", so no popup was ever attached
        # Cluster labels start at 0, so index directly (the previous
        # `clust-1` only worked through negative-index wraparound). The modulo
        # keeps the lookup valid if k is raised above the palette size.
        color=cluster_colors[clust % len(cluster_colors)],
        fill=True,
        fill_color='black',
        fill_opacity=0.9,
    ).add_to(TorontoClust)

TorontoClust