## Assignment: Segmenting and Clustering Neighborhoods in Toronto - Part 3
### Problem: Explore and cluster the neighborhoods in Toronto.

In [1]:
# Check if these libraries are installed before importing
#!conda install -c conda-forge geopy --yes
#!conda install -c conda-forge folium=0.5.0 --yes
#!conda install -c conda-forge matplotlib --yes

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
from sklearn.cluster import KMeans
import folium 
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Libraries imported.


### Scrape, clean and organize data into a dataframe

In [1]:
# Create a dataframe from the wikipedia table
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()  # stop here instead of parsing an HTTP error page
soup = BeautifulSoup(res.content, "html.parser")
table = soup.find("table", class_="wikitable sortable")
if table is None:
    raise ValueError("No 'wikitable sortable' table was found on the page")

postcodes = []
boroughs = []
neighborhoods = []
for row in table.find_all("tr"):
    cells = row.find_all("td")
    if len(cells) == 3:  # Only extract table body not heading
        # get_text() always returns a string (an empty cell gives ""), so the
        # ', '.join aggregation below never receives None.
        postcodes.append(cells[0].get_text().strip())
        boroughs.append(cells[1].get_text().strip())
        neighborhoods.append(cells[2].get_text().strip())

if not postcodes:
    raise ValueError("No three-column rows were found in the postal code table")

# Use lists to create the dataframe
df = pd.DataFrame({"PostalCode": postcodes, "Borough": boroughs, "Neighborhood": neighborhoods})
df = df[df["Borough"] != "Not assigned"]
# create a grouped dataframe by Postalcode: neighborhoods that share a
# postal code and borough are joined into one comma separated string
df = (
    df.groupby(["PostalCode", "Borough"], sort=False)["Neighborhood"]
    .agg(", ".join)
    .reset_index()
)
# Clean the text by removing any remaining new lines "\n"
df = df.replace('\n', '', regex=True)

NameError: name 'requests' is not defined

In [None]:
# Abandoned experiment, kept for reference only: looking up the coordinates of a
# single postal code ("M5G") with the geocoder library. It was too slow and no
# results were displayed.
# The code below is wrapped in a triple-quoted string, so it is a plain string
# expression and none of it runs.
# NOTE(review): the while-loop inside the string has no retry limit; if this is
# ever re-enabled it keeps requesting until g.latlng is no longer None.
'''import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format("M5G"))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
print(latitude, longitude)'''

In [None]:
import io

# Download the CSV that holds the coordinates for each postal code.
url = "https://cocl.us/Geospatial_data"
response = requests.get(url, timeout=30)
response.raise_for_status()  # stop here instead of parsing an HTTP error page as CSV
s = response.content

# Rename 'Postal Code' to 'PostalCode' so the key column has the same name as in df.
# rename() returns a new frame, which keeps this cell safe to re-run.
df_new = pd.read_csv(io.StringIO(s.decode('utf-8'))).rename(columns={'Postal Code': 'PostalCode'})
df_new.head()

In [None]:
# Create a new dataframe containing the information from the first dataframe
# (PostalCode, Borough and Neighborhood) and from the second one (every other
# column of df_new), matching rows on PostalCode.
# Columns that a previous run of this cell already merged in are dropped first,
# so running the cell again does not create suffixed duplicates (_x / _y).
coordinate_columns = df_new.columns.difference(['PostalCode'])
df = df.drop(columns=coordinate_columns, errors='ignore').merge(df_new, on="PostalCode", how='inner')
df.head()

### Create a map of Toronto and all the neighborhoods

In [None]:
address = 'Toronto, ON, Canada'

# Look up the coordinates of the city; they are used to centre the map.
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() gives None when the service has no match for the query
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, ON, Canada are {}, {}.'.format(latitude, longitude))

In [None]:
# Base map centred on the coordinates stored in latitude / longitude.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of df; the popup text is "<neighborhood>, <borough>".
marker_rows = df[['Latitude', 'Longitude', 'Borough', 'Neighborhood']].itertuples(index=False, name=None)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

### Specify the credentials for the Foursquare API

In [None]:
# The Foursquare credentials are read from environment variables so that the
# secrets are never stored in, or shared together with, the notebook.
# NOTE: the key pair that used to be hard-coded in this cell has been exposed
# and should be revoked in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20200310' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
          'before calling the Foursquare API.')

# defining radius and limit of venues to get
radius=500
LIMIT=100

### Create a function for exploring nearby venues

In [None]:
# write a function to get all nearby venues

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues the Foursquare "explore" endpoint returns for each location.

    One GET request is made per (name, latitude, longitude) triple. The
    module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT values are sent
    with every request.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel with zip(); each triple describes one location.
    radius : int, default 500
        Forwarded unchanged as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        When no venue is found at all, the frame is empty but still has
        these seven columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # make the GET request; requests builds and escapes the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30)
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category is kept, with a missing category
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the frame well-formed
    # even when no rows were collected.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot.head()

### Analyze Neighborhoods and Venues

In [None]:
top_venues_num = 8

# For every neighborhood, print its venue categories ranked by frequency
# (rounded to two decimals), limited to the first top_venues_num entries.
for hood in toronto_grouped['Neighborhood']:
    print("\n----"+hood+"----")

    # transpose the neighborhood's row so that each category becomes a record
    hood_rows = toronto_grouped.loc[toronto_grouped['Neighborhood'] == hood]
    venue_freq = hood_rows.T.reset_index()
    venue_freq.columns = ['venue','frequency']

    # the first record holds the neighborhood name, not a frequency
    venue_freq = venue_freq.iloc[1:]
    venue_freq = venue_freq.assign(frequency=venue_freq['frequency'].astype(float))
    venue_freq = venue_freq.round({'frequency': 2})

    ranked = venue_freq.sort_values('frequency', ascending=False).reset_index(drop=True)
    print(ranked.head(top_venues_num))

In [None]:
# Create the new dataframe and display the top 10 venues for each neighborhood.
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

In [166]:
# Write a function to sort the venues in descending order.

def get_n_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [169]:
# Create the new dataframe and display the top 5 venues for each neighborhood.
# (numpy is already imported as np in the import cell at the top of the notebook)
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# ranks 1-3 take their own suffix, every later rank uses 'th'
columns = ['Neighborhood'] + [
    '{}{} Most Common Venue'.format(rank, indicators[rank - 1] if rank <= len(indicators) else 'th')
    for rank in range(1, num_top_venues + 1)
]

# create a new dataframe
neighbor_venue_sorted = pd.DataFrame(columns=columns)
neighbor_venue_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill in the ranked venue categories, one row of toronto_grouped at a time
for ind in np.arange(toronto_grouped.shape[0]):
    neighbor_venue_sorted.iloc[ind, 1:] = get_n_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbor_venue_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Restaurant,Café,Thai Restaurant,Bar
1,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Doner Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Yoga Studio,Distribution Center,Deli / Bodega
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Discount Store,Japanese Restaurant,Beer Store
4,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Pub,Gym,Skating Rink


### Use KMeans clustering to cluster the neighbourhoods and analyze each cluster

In [173]:
# set number of clusters
k = 5

# cluster on the venue category columns only; the neighborhood name is not a feature
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (n_init is given explicitly so the number of restarts
# does not depend on the installed scikit-learn version)
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels
# The labels go into a fresh copy: any 'Cluster_Labels' column left over from an
# earlier run is dropped first, so this cell can be re-run without the
# "already exists" error that insert() would otherwise raise.
venues_with_labels = neighbourhoods_venues_sorted.drop(columns='Cluster_Labels', errors='ignore')
venues_with_labels.insert(0, 'Cluster_Labels', kmeans.labels_)

# merge the labelled venue table into df, which holds latitude/longitude for each neighborhood
toronto_merged = df.join(venues_with_labels.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2.0,Park,Bus Stop,Food & Drink Shop,Yoga Studio,Distribution Center,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run
1,M4A,North York,Victoria Village,43.725882,-79.315572,1.0,Intersection,Coffee Shop,Hockey Arena,Portuguese Restaurant,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1.0,Coffee Shop,Bakery,Park,Café,Pub,Theater,Mexican Restaurant,Restaurant,Breakfast Spot,Hotel
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,1.0,Clothing Store,Accessories Store,Furniture / Home Store,Women's Store,Coffee Shop,Miscellaneous Shop,Boutique,Event Space,Vietnamese Restaurant,Eastern European Restaurant
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,1.0,Coffee Shop,Park,Burger Joint,Yoga Studio,Beer Bar,Deli / Bodega,Bar,Café,Portuguese Restaurant,Fast Food Restaurant


In [176]:
# Keep only the rows that have a value in every column.
toronto_merged = toronto_merged.dropna(axis=0, how='any')

In [177]:
# Convert the cluster labels to integers. astype() returns a new frame, so no
# column is assigned on the result of dropna() (avoids SettingWithCopyWarning).
toronto_merged = toronto_merged.astype({'Cluster_Labels': int})

In [187]:
# Print how many rows of toronto_merged carry each cluster label.
# The labels are taken from k instead of a hard-coded list, so the report
# stays correct when the number of clusters is changed.
for i in range(k):
    cluster_size = int((toronto_merged['Cluster_Labels'] == i).sum())
    print('\nNumber of items in the cluster', i, '= ', cluster_size)


Number of items in the cluster 0 =  17

Number of items in the cluster 1 =  64

Number of items in the cluster 2 =  12

Number of items in the cluster 3 =  3

Number of items in the cluster 4 =  3


In [180]:
# create a map to visualize the different clusters
map_clusters = folium.Map(location=[43.651070,-79.347015],zoom_start=10)

# set color scheme for the clusters: k evenly spaced rainbow colours, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'],
                             toronto_merged['Cluster_Labels']):
    # label i is drawn in colour i; the label is used as the index directly
    # instead of relying on negative-index wraparound for cluster 0
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters