# Importing Libraries for the Porject

In [4]:
import pandas as pd
import numpy as np
import matplotlib as mlt
from bs4 import BeautifulSoup
import requests

*Use the Requests and Beautiful soup packages for webscraping data*

In [5]:
website_url=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [6]:
soup=BeautifulSoup(website_url,'lxml')

In [7]:
My_table=soup.find('table',{'class':'wikitable sortable'})

## Data transformation steps
1. Assign columns to the empty dataframe
2. Peform various clean up operations on the dataframe 
    * Remove Boroughs not assigned 
    * Assign Neighbourhoods that are not assigned with those of Boroughs

In [8]:
df=pd.DataFrame(columns=['Postcode','Borough','Neighbourhood'],index=[1])
df.size
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [66]:
rows=My_table.find_all('tr')

l = []
for tr in rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td]
    l.append(row)
df=pd.DataFrame(l, columns=['Postcode','Borough','Neighbourhood'])

In [78]:
df=df[df['Borough']!='Not assigned']
dftt=df.head(10)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge


In [71]:
df.drop(0)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge


In [65]:
df['Neighbourhood'].replace('Not assigned', 'Queen\'s Park',inplace=True)
df.shape

KeyError: 'Neighbourhood'

## In the next few steps we will read geolocation data as csv and then merge this with the existing dataframe

In [14]:
geoloc=pd.read_csv('http://cocl.us/Geospatial_data')

In [15]:
geoloc.columns = ['Postcode', 'Latitude', 'Longitude']

In [16]:
df_location=pd.merge(df,geoloc,on=['Postcode'],how='inner')

In [42]:
df_location

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
5,M4E,East Toronto,The Beaches,43.676357,-79.293031
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,M5H,Downtown Toronto,Adelaide,43.650571,-79.384568


### Using the shape function to get list of rows and columns of the new dataframe 

In [18]:
df_location.shape

(211, 5)

#### Importing various libraries to 
1. read gelocation data (geocoder)
2. visualize(folium)
3. clustering analysis(sklearn)

In [19]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [113]:
website_url=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(website_url,'lxml')
My_table=soup.find('table',{'class':'wikitable sortable'})
df=pd.DataFrame(columns=['Postcode','Borough','Neighbourhood'],index=[1])
df.size
df.columns
rows=My_table.find_all('tr')

l = []
for tr in rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td]
    l.append(row)
df=pd.DataFrame(l, columns=['Postcode','Borough','Neighbourhood'])
df=df.drop(0)
df.shape
df_can=df[df['Borough']!='Not assigned']
df_can.shape
df_tor=df_can.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df_tor.columns = ['Postcode', 'Borough', 'Neighborhood']
df_tor.shape

(103, 3)

* Create a dataframe from original dataframe by filtering Boroughs with value 'Toronto'
* Then perform a merge operation of Dataframe and geolocation data

In [114]:
df_can=df[df['Borough'].str.contains('Toronto')] 
df_can.shape

In [115]:
df_tor.shape

(74, 3)

In [23]:
geoloc=pd.read_csv('http://cocl.us/Geospatial_data')
geoloc.columns = ['Postcode', 'Latitude', 'Longitude']
df_location=pd.merge(df_can,geoloc,on=['Postcode'],how='inner')

In [24]:
df_location

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
5,M4E,East Toronto,The Beaches,43.676357,-79.293031
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,M5H,Downtown Toronto,Adelaide,43.650571,-79.384568


In [25]:
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

The geograpical coordinate of Toronto are 43.653963, -79.387207.


# Visualise the data using Folium map

In [26]:
# add markers to map
for lat, lng, borough, neighborhood in zip(df_location['Latitude'], df_location['Longitude'], df_location['Borough'], df_location['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=1,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [27]:
## Create a dataframe for cluster analysis

In [28]:
df_clus=df_location.drop(['Borough','Neighbourhood','Postcode'],1)

In [29]:
df_clus

Unnamed: 0,Latitude,Longitude
0,43.65426,-79.360636
1,43.65426,-79.360636
2,43.657162,-79.378937
3,43.657162,-79.378937
4,43.651494,-79.375418
5,43.676357,-79.293031
6,43.644771,-79.373306
7,43.657952,-79.387383
8,43.669542,-79.422564
9,43.650571,-79.384568


In [30]:
num_clusters = 3

k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12)
k_means.fit(df_clus)
labels = k_means.labels_

print(labels.dtype)

int32


In [31]:
df_clus['Labels']=labels
df_clus

Unnamed: 0,Latitude,Longitude,Labels
0,43.65426,-79.360636,0
1,43.65426,-79.360636,0
2,43.657162,-79.378937,0
3,43.657162,-79.378937,0
4,43.651494,-79.375418,0
5,43.676357,-79.293031,0
6,43.644771,-79.373306,0
7,43.657952,-79.387383,0
8,43.669542,-79.422564,1
9,43.650571,-79.384568,0


In [80]:
dftt.drop(0)

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge


In [81]:
dftt=dftt.groupby('Postcode',as_index=False)

In [83]:
dftt=pd.series(dict(PostCode=dftt['Postcode'].values,Borough=dftt['Borough'].values,Neighbourhood=))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00A05510>
