# Notebook for final pfoject of IBM's professional Data Science specialization

In [1]:
import pandas as pd
import numpy as np
from pandas.io.html import read_html

In [2]:
import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library

In [3]:
page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
DFs = read_html(page, match='Borough', header=0) # a list of dataframes
df = DFs[0]#getting the dataframe frm the list

### Exploring the data:

In [7]:
df.describe()

Unnamed: 0,Postcode,Borough,Neighbourhood
count,288,288,288
unique,180,12,209
top,M9V,Not assigned,Not assigned
freq,8,77,78


In [18]:
print('not assigned boroughs: ', df.loc[df.Borough == 'Not assigned', 'Borough'].count())
print('not assigned neighbourhoods: ', df.loc[df.Neighbourhood == 'Not assigned', 'Neighbourhood'].count())

not assigned boroughs:  77
not assigned neighbourhoods:  78


### Ignore cells with a borough that is Not assigned:

In [19]:
df.Borough.replace('Not assigned', np.nan, inplace=True)

In [21]:
df.dropna(subset=['Borough'], inplace=True)
print('not assigned boroughs: ', df.loc[df.Borough == 'Not assigned', 'Borough'].count())

not assigned boroughs:  0


### grouping repeated neighbourhoods/postcode:

In [22]:
df.set_index(['Postcode', 'Borough'], inplace=True)

In [23]:
df.groupby(level=['Postcode','Borough']).agg(','.join)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
M1N,Scarborough,"Birch Cliff,Cliffside West"


### processing not assigned neighbourhoods:

In [27]:
df.reset_index(inplace=True)
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [29]:
print('not assigned neighbourhoods: ', df.loc[df.Neighbourhood == 'Not assigned', 'Neighbourhood'].count())


not assigned neighbourhoods:  1


In [30]:
df.loc[df.Neighbourhood == 'Not assigned', 'Borough']

6    Queen's Park
Name: Borough, dtype: object

In [32]:
df.at[85, 'Neighbourhood'] = 'Queen\'s park'

In [34]:
df.shape

(211, 3)

### Getting longitude and latitude of postcodes:

In [40]:
import geocoder

In [75]:
latitude = []
longitude = []
coordinates = None
for code in df.Postcode:
    while (coordinates == None):
        coordinates = geocoder.google(code + ', Toronto, Ontario').latlng
    latitude.append(coordinates[0])
    longitude.append(coordinates[1])

KeyboardInterrupt: 

##### the geocoder package is very un reliable reliable, hence I will use a file that provides the required coordinates

In [79]:
coordinates = pd.read_csv('Geospatial_Coordinates.csv')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [99]:
df['Latitude'] = 0
df['Longitude'] = 0

In [114]:
for code in df.Postcode:
    if code in coordinates['Postal Code'].values:
        print(coordinates.loc[coordinates['Postal Code'] == code, 'Latitude'])
        df.loc[df.Postcode == code, 'Latitude'] = coordinates.loc[coordinates['Postal Code'] == code, 'Latitude']
        df.loc[df.Postcode == code, 'Longitude'] = coordinates.loc[coordinates['Postal Code'] == code, 'Longitude']        

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,Harbourfront,,
3,M5A,Downtown Toronto,Regent Park,,
4,M6A,North York,Lawrence Heights,,
5,M6A,North York,Lawrence Manor,,
6,M7A,Queen's Park,Not assigned,,
7,M9A,Etobicoke,Islington Avenue,,
8,M1B,Scarborough,Rouge,,
9,M1B,Scarborough,Malvern,,


In [94]:
coordinates.loc[coordinates['Postal Code'] == 'M1B', 'Latitude']

0    43.806686
Name: Latitude, dtype: float64