# Notebook to explore and visualize neighborhoods

In [1]:
import numpy as np
import pandas as pd

# Show every column and every row when a dataframe is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): json_normalize is not referenced in the visible cells, and pandas
# documents the top-level `pandas.json_normalize` as its location since 1.0 — confirm before relying on it.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library
import urllib.request
from bs4 import BeautifulSoup

# Simple confirmation that all imports above succeeded.
print('Import done')

Import done


### Shortened code to create dataframe and append coordinates as in part 1 and 2 but without outputs

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page and parse it into the BeautifulSoup parse tree while the
# HTTP response is open; the `with` block closes the connection afterwards
# (the response object was previously never closed).
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    # Fail with a readable message instead of an AttributeError on None below.
    raise ValueError("No table with class 'wikitable sortable' found at {}".format(url))

# Extract postal codes and corresponding boroughs and neighborhoods and create the desired pandas dataframe
Postcode = []
Borough = []
Neighborhood = []

for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three data cells are taken over.
    if len(cells) == 3:
        # The first text node of each cell is stored.
        Postcode.append(cells[0].find(string=True))
        Borough.append(cells[1].find(string=True))
        Neighborhood.append(cells[2].find(string=True))

CA_postcodes = pd.DataFrame({'PostalCode': Postcode, 'Borough': Borough, 'Neighborhood': Neighborhood})

# Remove the '\n' characters contained in the scraped cell texts first, so that
# the comparisons below operate on clean values.
CA_postcodes = CA_postcodes.replace('\n', '', regex=True)

# Identify and drop entries with not assigned boroughs
index_Notassigned = CA_postcodes[CA_postcodes['Borough'] == 'Not assigned'].index
CA_postcodes = CA_postcodes.drop(index_Notassigned)

# Replace 'Not assigned' neighborhoods with the corresponding borough.
# A boolean mask selects exactly the affected rows; the previous version used a
# manual counter (`i + 1`) as row label, which only matches the intended row by
# coincidence once rows have been dropped and the index has gaps.
neighborhood_missing = CA_postcodes['Neighborhood'] == 'Not assigned'
CA_postcodes.loc[neighborhood_missing, 'Neighborhood'] = CA_postcodes.loc[neighborhood_missing, 'Borough']
        
# Collapse all neighborhoods that share a postal code and borough into one
# comma-separated string, keeping the order of first appearance (sort=False).
neighborhoods_per_code = CA_postcodes.groupby(['PostalCode', 'Borough'], sort=False)['Neighborhood']
final_df = neighborhoods_per_code.apply(', '.join).reset_index(name='Neighborhood')

# Report the dimensions of the resulting dataframe.
n_rows, n_cols = final_df.shape
print('The dataframe has', n_rows, 'rows and', n_cols, 'columns')
final_df.head(12)

The dataframe has 103 rows and 3 columns


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [3]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [4]:
import geocoder  # kept in this cell because the package is installed by the preceding cell

# Upper bound on lookups per postal code, so a persistent failure raises
# instead of looping forever.
MAX_GEOCODE_ATTEMPTS = 10
cnt = 10  # a progress line is printed every `cnt` entries


def _geocode_postal_code(postal_code, max_attempts=MAX_GEOCODE_ATTEMPTS):
    """Return the ``latlng`` pair reported by ArcGIS for a Toronto postal code.

    The lookup is repeated while the geocoder reports no coordinates.

    Args:
        postal_code: value inserted into the query '<postal_code>, Toronto, Ontario'.
        max_attempts: maximum number of lookups before giving up.

    Raises:
        RuntimeError: if no coordinates were obtained after ``max_attempts`` lookups.
    """
    for _ in range(max_attempts):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        # Test the result BEFORE indexing it: the previous loop indexed first,
        # so an empty result raised a TypeError instead of triggering a retry.
        if lat_lng_coords:
            return lat_lng_coords
    raise RuntimeError(
        'No coordinates found for {} after {} attempts'.format(postal_code, max_attempts))


# add latitude and longitude to pandas dataframe
latitudes = []
longitudes = []
for i, postal_code in enumerate(final_df['PostalCode']):
    lat, lng = _geocode_postal_code(postal_code)
    latitudes.append(lat)
    longitudes.append(lng)

    # Progress checker for every ten updated entries
    if i % cnt == 0:
        print('Coordinates appended for',i,'of',len(final_df),'entries')

# Assign whole columns at once instead of chained `df[col][i] = ...` writes,
# which pandas does not guarantee to reach the original dataframe.
final_df['Latitude'] = latitudes
final_df['Longitude'] = longitudes

final_df.head(12)

Coordinates appended for 0 of 103 entries
Coordinates appended for 10 of 103 entries
Coordinates appended for 20 of 103 entries
Coordinates appended for 30 of 103 entries
Coordinates appended for 40 of 103 entries
Coordinates appended for 50 of 103 entries
Coordinates appended for 60 of 103 entries
Coordinates appended for 70 of 103 entries
Coordinates appended for 80 of 103 entries
Coordinates appended for 90 of 103 entries
Coordinates appended for 100 of 103 entries


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7524,-79.3293
1,M4A,North York,Victoria Village,43.7304,-79.3133
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.6551,-79.3626
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.7231,-79.4516
4,M7A,Queen's Park,Queen's Park,43.6611,-79.391
5,M9A,Etobicoke,Islington Avenue,43.6622,-79.5284
6,M1B,Scarborough,"Rouge, Malvern",43.8115,-79.1955
7,M3B,North York,Don Mills North,43.7492,-79.3619
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.7075,-79.3118
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.6574,-79.3782


### Create a map with latitude and longitude of 'postal code coordinates'

In [None]:
# Second name for the same dataframe object (not a copy); the later cells read `toronto_data`.
toronto_data = final_df

# Look up the coordinates of the city itself; they serve as the centre of the map.
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10.5)

# Add one circle marker per dataframe row, with a "<neighborhood>, <borough>" popup.
marker_rows = zip(final_df['Latitude'], final_df['Longitude'], final_df['Borough'], final_df['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

#### Define Foursquare Credentials and Version

##### I have created a json-file named 'secrets.json' with my credentials in the form:
##### {
#####     "CLIENT_ID": "your_client_id",
#####     "CLIENT_SECRET": "your_client_secret",
#####     "VERSION": "your_version"
##### }
##### This has been saved in the directory of the notebook. To proceed, please create a similar json-file 
##### as described above with your credentials and proceed with the following cell calling the credentials:

In [None]:
# Read the credentials from 'secrets.json' in the notebook directory.
# The `with` block closes the file once the JSON has been parsed
# (the handle returned by open() was previously never closed).
with open('secrets.json') as secrets_file:
    secrets = json.load(secrets_file)

CLIENT_ID = secrets['CLIENT_ID']
CLIENT_SECRET = secrets['CLIENT_SECRET']
VERSION = secrets['VERSION']

### Getting categories with the **get_category_type** function from the Foursquare lab.

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue record.

    ``row`` is indexed with the key 'categories'; if that key is missing, the
    key 'venue.categories' is used instead. The value found is treated as a
    sequence whose elements have a 'name' entry.

    Returns:
        The 'name' of the first element, or None when the sequence is empty.

    Raises:
        KeyError: if neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is no longer swallowed by a bare `except`.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

## As in the example for NYC we explore the Neighborhoods in Toronto

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Query the Foursquare 'explore' endpoint for venues around each location.

    Args:
        names: labels of the locations; copied into the 'Neighborhood' column.
        latitudes: latitude of each location, parallel to ``names``.
        longitudes: longitude of each location, parallel to ``names``.
        radius: value sent as the ``radius`` query parameter.
        limit: value sent as the ``limit`` query parameter. When omitted, a
            notebook-level variable named ``limit`` is used if it exists
            (this is what the function read before the parameter was added),
            otherwise 100.

    Returns:
        A dataframe with one row per returned venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        'Venue Category' is None for a venue without categories. The dataframe
        has these columns even when no venue was returned at all.

    Raises:
        requests.HTTPError: if a request is answered with an error status.
    """
    if limit is None:
        # The parameter shadows the module-level name, hence the globals() lookup.
        limit = globals().get('limit', 100)

    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request; surface HTTP errors directly instead of as a
        # KeyError on the missing 'groups' entry further down
        response = requests.get(url)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories no longer raises IndexError
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows;
    # assigning `.columns` afterwards fails on an empty dataframe.
    return pd.DataFrame(venue_rows, columns=column_names)

#### Use the getNearbyVenues-function on each neighborhood and create the dataframe 'toronto_venues'.

In [None]:
limit = 100   # maximum number of venues requested per location; read by getNearbyVenues
radius = 500  # search radius around each location

# `radius` is now actually handed to the function; before, the variable was
# assigned but never passed, so changing it had no effect on the query.
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                 latitudes=toronto_data['Latitude'],
                                 longitudes=toronto_data['Longitude'],
                                 radius=radius
                                )

print(toronto_venues.shape)
toronto_venues.head()

#### Check the number of venues for each neighborhood (postal code)

In [None]:
# Count the non-null entries of every column within each neighborhood.
venues_by_neighborhood = toronto_venues.groupby('Neighborhood')
venues_by_neighborhood.count()

#### Number of unique categories from all returned venues

In [None]:
# Number of distinct values in the 'Venue Category' column.
distinct_categories = toronto_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(distinct_categories)))

## Analyze Each Neighborhood (postal code)

In [None]:
# One-hot encode the venue categories: one indicator column per category,
# named exactly like the category (empty prefix and separator).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighborhood of every venue to the encoded rows.
# NOTE(review): if a category is itself called 'Neighborhood', this assignment
# replaces that indicator column — verify against the data.
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# Reorder the columns so that 'Neighborhood' comes first; all other columns
# keep their relative order.
idx_nghbr = toronto_onehot.columns.get_loc("Neighborhood")
all_columns = list(toronto_onehot.columns)
fixed_columns = [all_columns[idx_nghbr]] + all_columns[:idx_nghbr] + all_columns[idx_nghbr + 1:]
toronto_onehot = toronto_onehot[fixed_columns]

print('New size of the dataframe:',toronto_onehot.shape)
toronto_onehot.head()

#### Group rows by neighborhood (postal code), taking the mean of the frequency of occurrence of each category

In [None]:
# Average the indicator columns within each neighborhood, then turn the
# 'Neighborhood' group key back into a regular column.
onehot_by_neighborhood = toronto_onehot.groupby('Neighborhood')
toronto_grouped = onehot_by_neighborhood.mean().reset_index()

print('Confirmation of the adapted size; new size:', toronto_grouped.shape)
toronto_grouped.head()

#### Output of the top 5 most common venues for each neighborhood

In [None]:
num_top_venues = 5

# For every neighborhood, print its categories with the highest mean frequency.
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")

    # Transpose the matching row(s) so that each category becomes one table row.
    hood_rows = toronto_grouped[toronto_grouped['Neighborhood'] == hood]
    freq_table = hood_rows.T.reset_index()
    freq_table.columns = ['venue','freq']

    # Skip the first transposed row (assumes 'Neighborhood' is the first column
    # of toronto_grouped — confirm against the grouping cell).
    freq_table = freq_table.iloc[1:]
    freq_table = freq_table.assign(freq=freq_table['freq'].astype(float))
    freq_table = freq_table.round({'freq': 2})

    ranked = freq_table.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

Function to sort the venues of a neighborhood in descending order of frequency.

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted in
    descending order of value and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    return (
        row.iloc[1:]
        .sort_values(ascending=False)
        .head(num_top_venues)
        .index.values
    )

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # Explicit bounds check instead of a bare `except:` around the list lookup,
    # which would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill every row with the neighborhood's categories, most frequent first
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

## 4. Cluster Neighborhoods
Run *k*-means to cluster the neighborhoods into 5 clusters.

In [None]:
# set number of clusters
kclusters = 5

# Cluster on the category-frequency columns only.
# `columns=` replaces the positional axis argument `drop('Neighborhood', 1)`,
# which pandas 2 no longer accepts.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is set explicitly so the number of initialisations (and therefore the
# result) does not depend on the default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, init='k-means++', n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:
# add clustering labels (plain column assignment, so the cell can be re-run)
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_

# join the top venues and cluster label of each neighborhood onto toronto_data,
# which carries the latitude/longitude columns
toronto_merged = toronto_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Rows without a match in neighborhoods_venues_sorted have no label (NaN) after
# the join; they get the next free label. Series.max() skips NaN, whereas the
# built-in max() used before can return NaN depending on where the NaN sits.
unlabeled_cluster = toronto_merged['Cluster Labels'].max() + 1
toronto_merged['Cluster Labels'] = (
    toronto_merged['Cluster Labels']
    .fillna(unlabeled_cluster)   # reassigned instead of a chained inplace fillna
    .astype('int64')
)

toronto_merged.head()

In [None]:
# Number of rows per cluster label, ordered by label.
rows_per_label = toronto_merged['Cluster Labels'].value_counts()
rows_per_label.sort_index()

Finally, let's visualize the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one colour for every label value from 0 up
# to the largest label present in toronto_merged, so that a label equal to
# `kclusters` (one past the k-means labels) also gets its own colour.
n_labels = int(toronto_merged['Cluster Labels'].max()) + 1
colors_array = cm.rainbow(np.linspace(0, 1, n_labels))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)

    # Index the palette with the label itself. The former `cluster-1` made
    # label 0 wrap around to the last colour, which the label `kclusters`
    # received as well, so two groups were drawn in the same colour.
    marker_color = rainbow[cluster]

    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## 5. Examine Clusters

### Write down the number of cluster members for each cluster

In [None]:
# Number of members of each cluster, listed in ascending order of cluster label.
members_per_cluster = toronto_merged['Cluster Labels'].value_counts()
members_per_cluster.sort_index()

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

#### Cluster 1 - Food & Drinks

In [None]:
# Rows with cluster label 0, restricted to the column at position 1 plus every
# column from position 5 onwards.
shown_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 0
toronto_merged.loc[in_cluster, shown_columns]

#### Cluster 2 - Entertainment

In [None]:
# Rows with cluster label 1, restricted to the column at position 1 plus every
# column from position 5 onwards.
shown_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 1
toronto_merged.loc[in_cluster, shown_columns]

#### Cluster 3 - Car

In [None]:
# Rows with cluster label 2, restricted to the column at position 1 plus every
# column from position 5 onwards.
shown_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 2
toronto_merged.loc[in_cluster, shown_columns]

#### Cluster 4 - Park and spare time

In [None]:
# Rows with cluster label 3, restricted to the column at position 1 plus every
# column from position 5 onwards.
shown_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 3
toronto_merged.loc[in_cluster, shown_columns]

#### Cluster 5 - Medics

In [None]:
# Rows with cluster label 4, restricted to the column at position 1 plus every
# column from position 5 onwards.
shown_columns = toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]
in_cluster = toronto_merged['Cluster Labels'] == 4
toronto_merged.loc[in_cluster, shown_columns]