# Coursera Capstone Project

In [1]:
#import required libraries
import pandas as pd
from bs4 import BeautifulSoup as bsp
import requests

In [2]:
#use requests to 'get'the url text
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

#pass this source (and lxml) to BeautifulSoup
soup = bsp(source, 'lxml')

#scrape the webpage for the table of canada neighbourhoods
table = soup.find('table', class_='wikitable sortable')

#collect the table column titles into a list
table_columns = []
for th in table.find_all('th'):
    table_columns.append(th.text)


In [3]:
# replace the 'Neighbourhood\n' in the list with 'Neighbourhood'
table_columns[2] = 'Neighbourhood'
table_columns

['Postcode', 'Borough', 'Neighbourhood']

In [4]:
# collect the table row values into a row of lists 
output_rows = []
for table_row in table.find_all('tr'): # find all tr -> table rows
    columns =  table_row.find_all('td') # within each row, find all table data belonging to that row
    output_row = []
    for column in columns:
         output_row.append(column.text) # append each data to its column 
    output_rows.append(output_row) #append each row to its row


In [5]:
can_df = pd.DataFrame(output_rows)

In [6]:
can_df.columns = table_columns
can_df.drop([0], axis = 0, inplace=True)

In [7]:
can_df.iloc[:, 2].head(5)

1        Not assigned\n
2        Not assigned\n
3           Parkwoods\n
4    Victoria Village\n
5        Harbourfront\n
Name: Neighbourhood, dtype: object

In [8]:
# the values in Neighbourhood column need to be stripped of '\n' special character
can_df['Neighbourhood'] = can_df['Neighbourhood'].map(lambda x:str(x)[:-1])

In [9]:
can_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


In [10]:
can_df.shape[0]

288

In [11]:
# some Postcodes have Boroughs but neighbourhood 'Not assigned'
# assign to a Neighbourhood with 'Not Assigned' the value of it's Borough 
for i in range(can_df.shape[0]):
    if (can_df.iloc[i,1] !='Not assigned') & (can_df.iloc[i,2] == 'Not assigned'):
        can_df.iloc[i,2] = can_df.iloc[i,1]

In [12]:
can_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
10,M8A,Not assigned,Not assigned


In [13]:
# some boroughs have the same postcodes but different neighbourhoods
# group together these boroughs based on Postcodes and aggregate the values of their neighbourhood
aggregate_func = {'Borough':'first', 'Neighbourhood': lambda x: ', '.join(x)}
can_df_new = can_df.groupby(can_df['Postcode']).aggregate(aggregate_func)


In [14]:
can_df_new

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1A,Not assigned,Not assigned
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"


In [15]:
#reset index
can_df_new.reset_index(inplace=True)
can_df_new.shape

(180, 3)

In [16]:
# we need to the Postodes with Borough 'Not assigned'
#But first we must collect their indices
drop_list = []
for i in range(can_df_new.shape[0]):
    if can_df_new.iloc[i,1] == 'Not assigned':
        drop_list.append(i)
drop_list[0:9]

[0, 18, 19, 20, 21, 22, 23, 24, 33]

In [17]:
# drop these rows
can_df_new.drop(drop_list, axis=0, inplace=True)

In [18]:
# reset index to make the count start from 0
can_df_new.reset_index(inplace=True)
can_df_new.head(5)

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,1,M1B,Scarborough,"Rouge, Malvern"
1,2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,4,M1G,Scarborough,Woburn
4,5,M1H,Scarborough,Cedarbrae


In [19]:
# Let's drop the 'index' column
can_df_new.drop(['index'], axis=1, inplace=True)

In [20]:
# finally our toronto neighbourhood!!!
tor_neighbor = can_df_new
tor_neighbor.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [21]:
tor_neighbor.shape

(103, 3)

In [22]:
lat_lng_coords = pd.read_csv('~/Desktop/projects/github-example/Geospatial_Coordinates.csv')

In [23]:
lat_lng_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [24]:
tor_neighbor[['Latitude', 'Longitude']] = lat_lng_coords[['Latitude', 'Longitude']]

In [25]:
tor_neighbor.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [26]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


In [27]:
conda update -n base -c defaults conda

Collecting package metadata (repodata.json): done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::numba==0.43.1=py37h962f231_0
done

## Package Plan ##

  environment location: /home/emperor/anaconda3

  added / updated specs:
    - conda


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    babel-2.7.0                |             py_0         5.8 MB
    bzip2-1.0.8                |       h7b6447c_0         105 KB
    chardet-3.0.4              |        py37_1003         173 KB
    cloudpickle-1.2.2          |             py_0          29 KB
    cryptography-2.7           |   py37h1ba5d50_0         608 KB
    defusedxml-0.6.0           |             py_0          23 KB
    docutils-0.15.2            |           py37_0         736 KB
    filelock-3.0.12            | 

In [28]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(tor_neighbor['Borough'].unique()),
        tor_neighbor.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


In [29]:
tor_neighbor['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [30]:
Toronto_data = tor_neighbor[tor_neighbor['Borough'].str.endswith('Toronto')]
                            

In [31]:
Toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [32]:
Toronto_data.reset_index(inplace=True)

In [33]:
Toronto_data.drop(['index'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [34]:
Toronto_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [35]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ont_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [36]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_data[['Borough']], prefix="", prefix_sep="")

# add borough column back to dataframe
Toronto_onehot['Borough'] = Toronto_data['Borough'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Borough,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,East Toronto,0,0,1,0
1,East Toronto,0,0,1,0
2,East Toronto,0,0,1,0
3,East Toronto,0,0,1,0
4,Central Toronto,1,0,0,0


In [37]:
Toronto_grouped = Toronto_data.groupby('Borough').mean().reset_index()
Toronto_grouped

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654169,-79.383665
2,East Toronto,43.669436,-79.324654
3,West Toronto,43.652653,-79.44929


In [38]:
# set number of clusters
kclusters = 4

Toronto_grouped_clustering = Toronto_onehot.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5]

array([0, 0, 0, 0, 2], dtype=int32)

In [39]:
# add clustering labels
Toronto_data.insert(5, 'Cluster Labels', kmeans.labels_)

Toronto_data.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2


In [40]:
 #create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x  for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Borough'], Toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters