 #                     IBM APPLIED Data Science Capstone Project

#      Segmenting and Clustering Neighborhoods in Toronto


In [1]:
# Notebook-wide imports, environment setup and pandas display configuration.
import numpy as np                        # library to handle data in a vectorized manner

import pandas as pd                       # library for data analysis
pd.set_option('display.max_columns', None)  # None removes the column limit when a frame is displayed
pd.set_option('display.max_rows', None)     # None removes the row limit when a frame is displayed

import json                               # library to handle JSON files

!conda install -c conda-forge geopy --yes           # installs geopy from conda-forge; comment this line out if geopy is already installed
from geopy.geocoders import Nominatim               # convert an address into latitude and longitude values

import requests                                    # library to handle requests
from pandas.io.json import json_normalize                 # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes          # installs folium 0.5.0 from conda-forge; comment this line out if folium is already installed
import folium                                             # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.21.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

#   Web Scraping Toronto Boroughs & Postal Codes


Install Beautiful Soup Package & Continue....

Extract the WikiTable with 287 rows.

In [2]:
from bs4 import BeautifulSoup
import requests
import csv

# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps the cell from hanging indefinitely and raise_for_status()
# prevents an HTTP error page from being parsed as if it were the table.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')

# First <table> element of the page.
# NOTE(review): presumably this is the postal-code table - confirm if the page layout changes.
table = soup.table.tbody

# Collect every non-empty text fragment of the table, in document order, in one pass.
# (Appending to a DataFrame inside the loop copied the frame on every iteration and
# relies on DataFrame.append, which pandas 2.0 removed.)
cell_texts = list(table.stripped_strings)
table_df = pd.DataFrame(cell_texts)

# The flat list of strings is folded into rows of three cells, header row included.
n_table_columns = 3
if not cell_texts or len(cell_texts) % n_table_columns != 0:
    raise ValueError(
        'Expected the scraped table to contain a non-empty multiple of {} cells, got {}.'.format(
            n_table_columns, len(cell_texts)))

table_grid = table_df.to_numpy().reshape(-1, n_table_columns)

# The first row of the grid holds the column titles; the remaining rows are the data.
Borough_df = pd.DataFrame(table_grid[1:], columns=table_grid[0])
Borough_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


### Now that we have the required table, let us remove the 'Not assigned' boroughs [210 rows remain] and wrangle the data
After wrangling, we are left with 103 unique postal codes.

In [3]:
# Drop the rows whose Borough is 'Not assigned'; where only the Neighbourhood is
# 'Not assigned', fall back to the Borough value.
assigned_df = Borough_df[Borough_df['Borough'] != 'Not assigned'].reset_index(drop=True)

# Turn the remaining 'Not assigned' markers into missing values so fillna can replace them.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
assigned_df = assigned_df.replace('Not assigned', np.nan)
assigned_df = assigned_df.assign(
    Neighbourhood=assigned_df['Neighbourhood'].fillna(assigned_df['Borough'])
)

# Group by Postcode & Borough, concatenating the Neighbourhood values with ', '
# so that each (Postcode, Borough) pair ends up on a single row.
Borough_df = assigned_df.groupby(['Postcode', 'Borough'], as_index=False).agg(', '.join)
print(Borough_df.head())

print('\n \n The dataframe has {} boroughs \n and \n {} ROWS .'.format(
        len(Borough_df['Borough'].unique()),
        Borough_df.shape[0]))

0 Postcode      Borough                           Neighbourhood
0      M1B  Scarborough                          Rouge, Malvern
1      M1C  Scarborough  Highland Creek, Rouge Hill, Port Union
2      M1E  Scarborough       Guildwood, Morningside, West Hill
3      M1G  Scarborough                                  Woburn
4      M1H  Scarborough                               Cedarbrae

 
 The dataframe has 11 boroughs 
 and 
 103 ROWS .


### Fetch Location data from the given link & Join with boroughs data [Tor_df]  

In [4]:
# Read the CSV of postal-code coordinates straight from its URL and preview the first rows.
geospatial_csv_url = 'http://cocl.us/Geospatial_data'
loc_df = pd.read_csv(geospatial_csv_url)
loc_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [5]:
# Attach coordinates to every borough row: left join on the postal code
# ('Postcode' in Borough_df, 'Postal Code' in loc_df), then drop the duplicated key column.
Tor_df = (
    Borough_df
    .merge(loc_df, how='left', left_on='Postcode', right_on='Postal Code')
    .drop(columns=['Postal Code'])
    .reset_index(drop=True)
)
Tor_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Create a map of Toronto Boroughs using FOLIUM

Use Geopy to get Location of Toronto

Create Neighbourhood Map of Toronto

In [6]:
# Look up the coordinates of Toronto with the Nominatim geocoder; they are used
# to centre the map drawn further down.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Geocoding returned no result for address {!r}.'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [8]:
# Build a folium map centred on Toronto and draw one circle marker per row of Tor_df.
map_tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every marker.
# NOTE(review): parse_html is forwarded to CircleMarker exactly as before; it does not
# look like a CircleMarker styling option - confirm whether it has any effect.
marker_options = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# Each popup reads "<neighbourhood>, <borough>".
for place in Tor_df.itertuples(index=False):
    popup_text = '{}, {}'.format(place.Neighbourhood, place.Borough)
    marker = folium.CircleMarker(
        [place.Latitude, place.Longitude],
        popup=folium.Popup(popup_text, parse_html=True),
        **marker_options
    )
    marker.add_to(map_tor)

map_tor