# Clustering and Segmenting Neighborhoods in Toronto

## Week 3 - Capstone Project Assignment

In [None]:
# Install the third-party packages used by this notebook from the conda-forge channel.
# `--yes` is passed on every install so the cell runs without a confirmation prompt.
!conda install -c conda-forge beautifulsoup4 --yes

!conda install -c conda-forge geopy --yes

# folium is the only package pinned to an exact version in this cell.
!conda install -c conda-forge folium=0.5.0 --yes

# NOTE(review): this message is printed unconditionally; the cell does not check
# whether the conda commands above actually succeeded.
print('Libraries installed!')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    beautifulsoup4-4.9.1       |   py36h9f0ad1d_0         163 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.6 MB

The following NEW packages will be INSTALLED:

    python_abi:      3.6-1_cp36m       conda-forge

The following packages will be UPDATED:

    beautifulsoup4:  4.7.1-py36_1                

In [None]:
# Numerical / tabular data handling
import numpy as np
import pandas as pd
# Remove pandas' display truncation: None means "no limit" for both options.
# NOTE(review): with max_rows unlimited, displaying a whole DataFrame prints every row;
# prefer .head() in cells that show data.
pd.set_option('display.max_columns' , None)
pd.set_option('display.max_rows' , None)

# HTTP requests and JSON handling
import requests
import json
# NOTE(review): newer pandas releases expose this function as `pandas.json_normalize`
# and deprecate the `pandas.io.json` path -- confirm against the pandas version in use.
from pandas.io.json import json_normalize

# HTML parsing of the scraped page
from bs4 import BeautifulSoup

# Geocoding (address -> latitude/longitude)
from geopy.geocoders import Nominatim

# Map rendering and colour helpers
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# Clustering
from sklearn.cluster import KMeans

print("Libraries Imported")

In [None]:
# Download the Wikipedia page listing the "M" postal codes and parse it with Beautiful Soup.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection.
wiki_response = requests.get(WIKI_URL, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
wiki_response.raise_for_status()

data = wiki_response.text
soup = BeautifulSoup(data, 'html.parser')

In [None]:
# Collect the first three columns (postal code, borough, neighborhood) of the
# first <table> on the page, one list per column.
postalCodeList = []
boroughList = []
neighborhoodList = []

postal_table = soup.find('table')
if postal_table is None:
    # soup.find returns None when nothing matches; fail with a readable message.
    raise ValueError('No <table> element was found in the downloaded page')

for row in postal_table.find_all('tr'):
    cells = row.find_all('td')
    # Skip rows without <td> cells (e.g. a header row) and any row too short
    # to provide all three columns, which would otherwise raise IndexError.
    if len(cells) < 3:
        continue
    # strip() removes the trailing newline as well as any other surrounding
    # whitespace, so later comparisons against 'Not assigned' are reliable.
    postal_code, borough, neighborhood = (cell.text.strip() for cell in cells[:3])
    postalCodeList.append(postal_code)
    boroughList.append(borough)
    neighborhoodList.append(neighborhood)

In [None]:
# Assemble the three scraped lists into one DataFrame, one column per list.
column_names = ('PostalCode', 'Borough', 'Neighborhood')
column_values = (postalCodeList, boroughList, neighborhoodList)
toronto_neighborhood = list(zip(column_names, column_values))

toronto_df = pd.DataFrame(dict(toronto_neighborhood))
toronto_df.head()

In [None]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
has_borough = toronto_df['Borough'].ne('Not assigned')
toronto_df_dropna = toronto_df.loc[has_borough].reset_index(drop=True)
toronto_df_dropna.head()

In [None]:
# One row per (PostalCode, Borough); the neighborhood names that share a
# postal code are concatenated into a single comma-separated string.
toronto_df_grouped = (
    toronto_df_dropna
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg({'Neighborhood': ','.join})
)
toronto_df_grouped.head()

In [None]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
na_neigh_rows = toronto_df_grouped['Neighborhood'].eq('Not assigned')
toronto_df_grouped['Neighborhood'] = toronto_df_grouped['Neighborhood'].mask(
    na_neigh_rows, toronto_df_grouped['Borough']
)
toronto_df_grouped.head()

In [None]:
# Give the grouped frame its final name. This is a plain assignment (no copy is made),
# so both names refer to the same DataFrame object.
toronto_df_cleaned = toronto_df_grouped
# Display the (rows, columns) of the cleaned table.
toronto_df_cleaned.shape

# Part 2: Getting the coordinates and adding them to the Toronto DataFrame

Now that we have built a dataframe with the postal code of each neighborhood along with the borough name and neighborhood name, we need the latitude and longitude coordinates of each neighborhood in order to utilize the Foursquare location data. In an older version of this course, the Google Maps Geocoding API was used to get the latitude and longitude coordinates of each neighborhood. However, Google has since started charging for that API (http://geoawesomeness.com/developers-up-in-arms-over-google-maps-api-insane-price-hike/), so we will use the Geocoder Python package instead: https://geocoder.readthedocs.io/index.html.

The problem with this package is that you sometimes have to be persistent in order to get the geographical coordinates of a given postal code: a call for a given postal code may return `None`, and making the same call again may then return the coordinates.

The geocoding loop below is kept for reference only; it lives in a markdown cell and is not executed:

```python
geolocator = Nominatim(user_agent="tl-toronto-neigh")

postalList = toronto_df_cleaned['PostalCode'].values
latList = []
longList = []

for post in postalList:
    location = None
    while(location is None):
        location = geolocator.geocode('{}, Toronto, Ontario'.format(post))
    if(location != None):
        lat = location.latitude
        long = location.longitude
        print(post, lat, long)
        latList.append(lat)
        longList.append(long)

toronto_coors = [('Postal Code', postalList), ('Latitude', latList), ('Longitude', longList)]
coors = pd.DataFrame.from_items(toronto_coors)
```

**Note:** Given that this package can be very unreliable, in case you are not able to get the geographical coordinates of the neighborhoods using the Geocoder package, here is a link to a CSV file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [None]:
# Download the postal-code coordinates CSV and load it into a DataFrame.
# Done with `requests` rather than a shell `wget` so the cell does not depend on
# an external binary and so a failed download stops the cell instead of being
# followed by a misleading success message.
COORDINATES_URL = 'http://cocl.us/Geospatial_data'
COORDINATES_CSV = 'toronto_coordinates.csv'

coordinates_response = requests.get(COORDINATES_URL, timeout=30)
coordinates_response.raise_for_status()

# Keep a local copy under the same file name the notebook has always used.
with open(COORDINATES_CSV, 'wb') as csv_file:
    csv_file.write(coordinates_response.content)
print('Coordinates downloaded!')

coors = pd.read_csv(COORDINATES_CSV)


In [None]:
# Show the (rows, columns) of the coordinates table.
print(coors.shape)

In [None]:
# Preview the first rows of the coordinates table.
coors.head()


In [None]:

# Index both tables by postal code so they can be lined up side by side.
# Note the differing column names: 'PostalCode' here vs. 'Postal Code' in the CSV.
toronto_df_temp = toronto_df_cleaned.set_index('PostalCode')
coors_temp = coors.set_index('Postal Code')

# Column-wise concatenation; the inner join keeps only postal codes present in both.
toronto_df_coors = pd.concat(
    objs=[toronto_df_temp, coors_temp],
    axis='columns',
    join='inner',
)


In [None]:

# Turn the postal-code index back into a regular 'PostalCode' column.
# The guard makes this cell safe to re-run: an unconditional reset_index would
# fail the second time because a 'PostalCode' column would already exist.
if 'PostalCode' not in toronto_df_coors.columns:
    toronto_df_coors = toronto_df_coors.rename_axis('PostalCode').reset_index()


In [None]:

# Show the (rows, columns) of the joined table, then preview its first rows.
print(toronto_df_coors.shape)
toronto_df_coors.head()


# Part 3: Explore and cluster the neighborhoods in Toronto

In [None]:

# Look up the latitude/longitude of Toronto, used below as the map centre.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent = "tl-toronto-neigh")
location = geolocator.geocode(address)
# geocode() can come back empty; fail with a clear message instead of an
# AttributeError on `None.latitude`.
if location is None:
    raise RuntimeError(f"Geocoding {address!r} returned no result; re-run this cell to try again")

lati = location.latitude
longi = location.longitude
print(f"The co-ordinates of the location are {lati} , {longi}")


In [None]:

# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[lati, longi], zoom_start=11)

# One circle marker per postal-code row, with a "Borough (PostalCode): Neighborhood" popup.
marker_columns = ['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood']
for lat, lon, post, borough, neigh in toronto_df_coors[marker_columns].itertuples(index=False, name=None):
    label = f"{borough} ({post}): {neigh}"
    popup = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=8,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.4,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: renders the map.
map_toronto


In [None]:

# Restrict the analysis to the four boroughs whose name contains "Toronto".
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
toronto_central_df = toronto_df_coors[toronto_df_coors['Borough'].isin(toronto_boroughs)].reset_index(drop = True)
# Inspect the frame that was just created (the previous code referenced the
# undefined name `toronto_central`, which raised NameError).
print(toronto_central_df.shape)
toronto_central_df.head()


In [None]:

# Add one circle marker per row of the filtered borough table.
# NOTE(review): the markers are added to the existing `map_toronto`, which already
# carries the markers for every borough -- confirm whether a fresh map was intended.
central_marker_columns = ['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood']
for lat, long, post, borough, neigh in toronto_central_df[central_marker_columns].itertuples(index=False, name=None):
    label = f"{borough} ({post}): {neigh}"
    popup = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: renders the map.
map_toronto


In [None]:

import os  # imported here because this is the only cell that needs it

# Foursquare API credentials are read from the environment so that secrets are
# never stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the credentials that used to be hard-coded in this cell were
# exposed with the notebook and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20200412'  # value sent as the `v` parameter of every Foursquare request

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail until they are.')


In [None]:

# Query the Foursquare "explore" endpoint once per postal-code row and collect
# one tuple per returned venue.
radius = 500   # sent as the `radius` query parameter
LIMIT = 100    # sent as the `limit` query parameter

venues = []

url = 'https://api.foursquare.com/v2/venues/explore'

for lat, long, post, borough, neighborhood in zip(toronto_central_df['Latitude'], toronto_central_df['Longitude'], toronto_central_df['PostalCode'], toronto_central_df['Borough'], toronto_central_df['Neighborhood']):
    # Let requests build and encode the query string instead of formatting it by hand.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, long),
        'radius': radius,
        'limit': LIMIT,
    }
    response = requests.get(url, params=params, timeout=30)
    # Report failures explicitly. The message deliberately omits the request URL,
    # because the URL contains the API credentials.
    if not response.ok:
        raise RuntimeError('Foursquare request for postal code {} failed with HTTP status {}'.format(
            post, response.status_code))

    results = response.json()["response"]['groups'][0]['items']

    for venue in results:
        venue_info = venue['venue']
        categories = venue_info['categories']
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            venue_info['name'],
            venue_info['location']['lat'],
            venue_info['location']['lng'],
            # A venue with an empty category list gets None instead of raising IndexError.
            categories[0]['name'] if categories else None))


In [None]:

# Build the venues table. Passing `columns=` to the constructor (rather than
# assigning `.columns` afterwards) also works when `venues` is empty, where the
# later assignment would raise a length-mismatch error.
venue_columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)
print(venues_df.shape)
venues_df.head()


In [None]:

# Count the non-null venue names in each (PostalCode, Borough, Neighborhood) group.
venues_df.groupby(['PostalCode' , 'Borough' , 'Neighborhood'])['VenueName'].count()


In [None]:
# Number of distinct venue categories; dropna=False counts a missing value as a
# category too, exactly as len(unique()) does.
venues_df['VenueCategory'].nunique(dropna=False)
