# Segmenting and Clustering Neighborhoods in Toronto - 1# 

Let's import the necessary libraries. BeautifulSoup package is used for web scraping. Requests library is used to handle the web requests

In [29]:
import numpy as np 
import pandas as pd
from bs4 import BeautifulSoup
import requests
import geocoder
# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't installed yet
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Folium installed
Libraries imported.


# Scraping the following Wikipedia page 
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, transforming the table with postal codes into pandas dataframe

In [30]:
source=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').content
soup = BeautifulSoup(source, 'lxml') #lxml parser is used 
print(soup) #prettify is a method which is used to indent the html file
#whihc makes it easier to under the tags

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"3b2f57d6-ac83-4a5f-9af0-bc9556041a9e","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":968796333,"wgRevisionId":968796333,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toronto","Ontario-relat

In [31]:
# obtain the data in the table of postal codes 
table = soup.find('table')
td = table.find_all('td')
postcode = []
borough = []
neighbourhood = []

for i in range(0, len(td), 3):
    postcode.append(td[i].text.strip())
    borough.append(td[i+1].text.strip())
    neighbourhood.append(td[i+2].text.strip())
    
# The dataframe has three columns PostalCode, Borough, and Neighborhood        
Toronto_df = pd.DataFrame(data=[postcode, borough, neighbourhood]).T
Toronto_df.columns = ['Postal Code', 'Borough', 'Neighborhood']

In [32]:
# Clean Dataframe
# Ignore cells with a borough that is Not assigned.
Toronto_df['Borough'].replace('Not assigned', np.nan, inplace=True)
Toronto_df.dropna(subset=['Borough'], inplace=True)

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 
Toronto_df['Neighborhood'].replace('Not assigned', "Queen's Park", inplace=True)

# More than one neighborhood can exist in one postal code area. 
# These two rows will be combined into one row with the neighborhoods separated with a comma.
Toronto_df =Toronto_df.groupby(['Postal Code', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
Toronto_df.columns = ['Postal Code', 'Borough', 'Neighborhood']
Toronto_df.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [33]:
# In the last cell of your notebook,use the .shape method to print the number of rows of your dataframe.
Toronto_df.shape

(103, 3)

In [34]:
#  csv file that has the geographical coordinates of each postal code 
url="http://cocl.us/Geospatial_data"
geospatial_df = pd.read_csv(url)
geospatial_df.columns = ['Postal Code', 'Latitude', 'Longitude']
toronto_metro_df = pd.merge(Toronto_df, geospatial_df, on=['Postal Code'], how='inner')
toronto_metro_df .head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [35]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_metro_df ['Borough'].unique()),
        toronto_metro_df .shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [36]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="coursera-capstone-project")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Canada are 43.6534817, -79.3839347.


In [37]:
# Work with only boroughs that contain the word Toronto.
toronto_data = toronto_metro_df[toronto_metro_df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Now that we have the Neighborhoods information with their corresponding latitude and longitude values. let's create a map of Toronto with Neighborhoods.

In [38]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(
        toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighborhood']
        ):
    label = '{}, {}'.format(toronto_data, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto )  
    
map_toronto

using the Foursquare API to explore the neighborhoods and segment them


# Define the Foursquare Credentials and version you required 

In [39]:
CLIENT_ID = 'ZFT4ZHLW1GGVJXVOSZ2UQT55NTEDHBGKX2OEMEKZMXZDRM5T' # your Foursquare ID
CLIENT_SECRET = 'WW3J5UJRVU05KIOBVKDM41RWYM4BCFHC2IEKRNXZVX1OLW3N' # your Foursquare Secret
VERSION = '20200721' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ZFT4ZHLW1GGVJXVOSZ2UQT55NTEDHBGKX2OEMEKZMXZDRM5T
CLIENT_SECRET:WW3J5UJRVU05KIOBVKDM41RWYM4BCFHC2IEKRNXZVX1OLW3N


Explore the Neighborhoods in the dataframe

In [40]:
toronto_data.loc[1, 'Neighborhood']

'The Danforth West, Riverdale'

In [41]:
neigh_latitude=toronto_data.loc[1,'Latitude']#this gives the nieghborhood latitude value
neigh_longitude=toronto_data.loc[1,'Longitude']#this gives the nieghborhood latitude value

#neighborhood name
neigh_name = toronto_data.loc[1, 'Neighborhood']

print('Latitude and longitude values of {} are {}  {}.'.format(neigh_name,neigh_latitude,
                                                               neigh_longitude))

Latitude and longitude values of The Danforth West, Riverdale are 43.6795571  -79.352188.


In [42]:
# number of venues returned by Foursquare API 
LIMIT = 100 
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, neigh_latitude, neigh_longitude, VERSION, radius, LIMIT)
res = requests.get(url).json()
res

{'meta': {'code': 200, 'requestId': '5f17ae71e10aee11f27c6e8c'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Greektown',
  'headerFullLocation': 'Greektown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 42,
  'suggestedBounds': {'ne': {'lat': 43.6840571045, 'lng': -79.34597738331301},
   'sw': {'lat': 43.675057095499994, 'lng': -79.35839861668698}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5102df8df13673cd7084ac39',
       'name': 'MenEssentials',
       'location': {'address': '412 Danforth Ave.',
        'crossStreet': 'Near Danforth and Chester',
        'lat': 43.677820068604575,
        'lng': -79.35126543045044,
        'labeledLatLngs'

In [43]:
#function to extract the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [44]:
venues = res['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,MenEssentials,Cosmetics Shop,43.67782,-79.351265
1,Pantheon,Greek Restaurant,43.677621,-79.351434
2,La Diperie,Ice Cream Shop,43.677702,-79.352265
3,Dolce Gelato,Ice Cream Shop,43.677773,-79.351187
4,Cafe Fiorentina,Italian Restaurant,43.677743,-79.350115


In [45]:
print('Number of venues returned by the Foursquare Api is {}'.format(nearby_venues.shape[0]))

Number of venues returned by the Foursquare Api is 42
