# Importing Libraries for the Porject

In [5]:
import pandas as pd
import numpy as np
import matplotlib as mlt
from bs4 import BeautifulSoup
import requests

*Use the Requests and Beautiful soup packages for webscraping data*

In [6]:
website_url=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [7]:
soup=BeautifulSoup(website_url,'lxml')

In [8]:
My_table=soup.find('table',{'class':'wikitable sortable'})

## Data transformation steps
1. Assign columns to the empty dataframe
2. Peform various clean up operations on the dataframe 
    * Remove Boroughs not assigned 
    * Assign Neighbourhoods that are not assigned with those of Boroughs

In [12]:
df=pd.DataFrame(columns=['Postcode','Borough','Neighbourhood'],index=[1])
df.size
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [13]:
rows=My_table.find_all('tr')

l = []
for tr in rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td]
    l.append(row)
df=pd.DataFrame(l, columns=['Postcode','Borough','Neighbourhood'])

In [14]:
df=df[df['Borough']!='Not assigned']

In [15]:
df=df.drop(0)

In [16]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [17]:
df['Neighbourhood'].replace('Not assigned', 'Queen\'s Park',inplace=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


## In the next few steps we will read geolocation data as csv and then merge this with the existing dataframe

In [18]:
geoloc=pd.read_csv('http://cocl.us/Geospatial_data')

In [19]:
geoloc.columns = ['Postcode', 'Latitude', 'Longitude']

In [20]:
df_location=pd.merge(df,geoloc,on=['Postcode'],how='inner')

In [21]:
df_location

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
5,M6A,North York,Lawrence Manor,43.718518,-79.464763
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353


### Using the shape function to get list of rows and columns of the new dataframe 

In [22]:
df_location.shape

(211, 5)

#### Importing various libraries to 
1. read gelocation data (geocoder)
2. visualize(folium)
3. clustering analysis(sklearn)

In [23]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [24]:
website_url=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(website_url,'lxml')
My_table=soup.find('table',{'class':'wikitable sortable'})
df=pd.DataFrame(columns=['Postcode','Borough','Neighbourhood'],index=[1])
df.size
df.columns
rows=My_table.find_all('tr')

l = []
for tr in rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td]
    l.append(row)
df=pd.DataFrame(l, columns=['Postcode','Borough','Neighbourhood'])
df=df.drop(0)

* Create a dataframe from original dataframe by filtering Boroughs with value 'Toronto'
* Then perform a merge operation of Dataframe and geolocation data

In [25]:
df_can=df[df['Borough'].str.contains('Toronto')]                                    

In [26]:
df_can

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
18,M5B,Downtown Toronto,Ryerson
19,M5B,Downtown Toronto,Garden District
35,M5C,Downtown Toronto,St. James Town
48,M4E,East Toronto,The Beaches
49,M5E,Downtown Toronto,Berczy Park
58,M5G,Downtown Toronto,Central Bay Street
59,M6G,Downtown Toronto,Christie
69,M5H,Downtown Toronto,Adelaide


In [28]:
geoloc=pd.read_csv('http://cocl.us/Geospatial_data')
geoloc.columns = ['Postcode', 'Latitude', 'Longitude']
df_location=pd.merge(df_can,geoloc,on=['Postcode'],how='inner')

In [29]:
df_location

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
5,M4E,East Toronto,The Beaches,43.676357,-79.293031
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,M5H,Downtown Toronto,Adelaide,43.650571,-79.384568


In [77]:
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

The geograpical coordinate of Toronto are 43.653963, -79.387207.


# Visualise the data using Folium map

In [31]:
# add markers to map
for lat, lng, borough, neighborhood in zip(df_location['Latitude'], df_location['Longitude'], df_location['Borough'], df_location['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=1,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [None]:
## Create a dataframe for cluster analysis

In [48]:
df_clus=df_location.drop(['Borough','Neighbourhood','Postcode'],1)

In [49]:
df_clus

Unnamed: 0,Latitude,Longitude
0,43.65426,-79.360636
1,43.65426,-79.360636
2,43.657162,-79.378937
3,43.657162,-79.378937
4,43.651494,-79.375418
5,43.676357,-79.293031
6,43.644771,-79.373306
7,43.657952,-79.387383
8,43.669542,-79.422564
9,43.650571,-79.384568


In [65]:
num_clusters = 3

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(df_clus)
labels = k_means.labels_

print(labels)

[0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 2 2 2 2 2 1
 1 2 2 2 2 1 1 2 0 0 1 1 2 2 0 0 0 2 2 2 2 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0]


In [53]:
df_cluster['Labels']=labels
df_cluster

Unnamed: 0,Postcode,Borough,Latitude,Longitude,Labels
0,M5A,Downtown Toronto,43.65426,-79.360636,1
1,M5A,Downtown Toronto,43.65426,-79.360636,1
2,M5B,Downtown Toronto,43.657162,-79.378937,1
3,M5B,Downtown Toronto,43.657162,-79.378937,1
4,M5C,Downtown Toronto,43.651494,-79.375418,1
5,M4E,East Toronto,43.676357,-79.293031,1
6,M5E,Downtown Toronto,43.644771,-79.373306,1
7,M5G,Downtown Toronto,43.657952,-79.387383,1
8,M6G,Downtown Toronto,43.669542,-79.422564,0
9,M5H,Downtown Toronto,43.650571,-79.384568,1


In [74]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
kclusters=3

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
rainbow
# add markers to the map
markers_colors = []
for lat, lon, bor,label in zip(df_cluster['Latitude'], df_cluster['Longitude'], df_cluster['Borough'], df_cluster['Labels']):
    label = folium.Popup(str(bor) + ' Cluster ' + str(label), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[label-1],
        fill=True,
        fill_color=rainbow[label-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

TypeError: unsupported operand type(s) for -: 'Popup' and 'int'