# Segmenting and Clustering Neighborhoods in Toronto

In [293]:
import pandas as pd
import numpy as np

# Lift pandas' display limits so frames are rendered without column/row truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)


## Extracting Data from URL

In [294]:
# Wikipedia page that lists the postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per HTML table found on the page.
# Select the table by content (it must contain the text "Postal Code") rather
# than trusting that it is the very first table on the page: if another table
# is ever inserted above it, positional selection would silently load the
# wrong data. read_html raises ValueError when no table matches.
postal_codes_df = pd.read_html(url, match='Postal Code')[0]
postal_codes_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Cleaning up Dataset

#### Rename column headers

In [295]:
# Replace the header labels, by position, with the names used in the rest of the notebook.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
postal_codes_df.columns = column_names
postal_codes_df.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


#### Filter out "Not assigned" Boroughs

In [296]:
# Keep only the rows whose Borough is known, then renumber the index from 0.
postal_codes_df = (
    postal_codes_df
    .loc[lambda frame: frame['Borough'].ne('Not assigned')]
    .reset_index(drop=True)
)
postal_codes_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Replacing "Not assigned" Neighborhoods with their respective Borough

In [297]:
# Rows that have a Borough but no Neighborhood name take the Borough as their name.
needs_name = (
    postal_codes_df["Neighborhood"].eq("Not assigned")
    & postal_codes_df["Borough"].ne("Not assigned")
)
postal_codes_df.loc[needs_name, "Neighborhood"] = postal_codes_df.loc[needs_name, "Borough"]

#### Shape of Postal code dataframe

In [298]:
# (number of rows, number of columns) of postal_codes_df at this point
postal_codes_df.shape

(103, 3)

## Foursquare location data

#### Extracting latitude and longitude for each neighborhood

In [299]:
# NOTE: this cell is disabled - every line is commented out, so no geocoding is performed.
# The coordinates used later come from Geospatial_Coordinates.csv, loaded in the following cell.
# NOTE(review): if re-enabled, the loop prints "trying again" on every pass,
# including the pass that succeeds.
#!pip install geocoder  #uncomment to install geocoder
# import geocoder
# #get postal code
# postal_code = 'M3A'
# #initializing variable
# lat_log_coords = None
# while (lat_log_coords is None):
#     g = geocoder.google("{}, Toronto, Ontario".format(postal_code))
#     lat_log_coords = g.latlng
#     print("trying again")
    
# lat_log_coords

In [300]:

import os
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Placeholder bound onto the response body below when it has no ``__iter__``."""
    return 0

# @hidden_cell
# The following code accesses a file in IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY_ID environment variable instead of
# being hardcoded, so the credential is never saved or shared with the notebook.
# A missing variable raises KeyError here, before any request is made.
# The key that was previously embedded in this cell should be revoked and replaced.
client_a747a310db1840d194ae6986fbdfbc77 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=os.environ['IBM_COS_API_KEY_ID'],
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# Fetch the CSV object; 'Body' is the stream holding the file content.
body = client_a747a310db1840d194ae6986fbdfbc77.get_object(Bucket='courseracapstone-donotdelete-pr-c1hqethnxxsgr5',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_1 = pd.read_csv(body)
# Rename the three columns by position so the key column is called "PostalCode",
# the same name the postal-code frame uses.
df_data_1.columns = ["PostalCode", "Latitude", "Longitude"]
df_data_1.head()


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [301]:
# Attach latitude/longitude to each postal code. The left join keeps every row of
# postal_codes_df even when the coordinate table has no matching PostalCode.
postal_codes_df = postal_codes_df.merge(
    df_data_1[["PostalCode", "Latitude", "Longitude"]],
    how="left",
    on="PostalCode",
)

postal_codes_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
