![Header Image](https://cdn.iconscout.com/icon/free/png-256/python-12-555278.png "header")

# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

---
## Part 1: Creating the dataframe

In [45]:
import pandas as pd

# Set dataframe display size
desired_width = 320
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 10)

# Read the postal-code table directly from Wikipedia using pandas' built-in HTML parser.
# `match` selects tables by their content (the "Postal Code" header) so the lookup does
# not silently pick up an unrelated table if the page layout changes.
WIKI_POSTAL_CODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(WIKI_POSTAL_CODES_URL, match='Postal Code')[0]

In [16]:
# Keep only Postal Codes with assigned Boroughs.
# .copy() makes the result an independent frame, so the column assignments done in
# later cells operate on real data instead of a slice of the scraped table.
df = df[df['Borough'] != 'Not assigned'].copy()

In [40]:
# Renumber the rows 0..n-1 now that unassigned boroughs have been filtered out
df.index = pd.RangeIndex(0, len(df))

# Count postal codes that appear more than once (first occurrence is not counted)
number_duplicates = df['Postal Code'].duplicated(keep='first').sum()
# Count rows whose Neighbourhood is still the placeholder 'Not assigned'
number_not_assigned_neighbourhood = len(df[df['Neighbourhood'] == 'Not assigned'])
print(
    "There are {} duplicate Postal Codes in the dataframe and {} rows with no assigned Neighbourhood.".format(
        number_duplicates,
        number_not_assigned_neighbourhood,
    )
)

There are 0 duplicate Postal Codes in the dataframe and 0 rows with no assigned Neighbourhood.


In [27]:
# Check result
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [28]:
# Check shape
df.shape

(103, 3)

---
## Part 2: Adding geographic coordinates to dataframe

In [26]:
# import geocoder

# # with geocoder: did not work even once
# for index, postal_code in zip(df.index, df['Postal Code']):
#     # initialize your variable to None
#     lat_lng_coords = None
#
#     # loop until you get the coordinates
#     while lat_lng_coords is None:
#         g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#         lat_lng_coords = g.latlng
#
#     latitude = lat_lng_coords[0]
#     longitude = lat_lng_coords[1]
#     df.loc[index, 'Latitude'] = latitude
#     df.loc[index, 'Longitude'] = longitude

# df.head(10)

In [30]:
# with csv file

df_latlong = pd.read_csv("http://cocl.us/Geospatial_data")
print(df_latlong.head())

# Index the coordinates by postal code; if a code is listed more than once keep the
# first entry, which is the row the previous per-row lookup would have used.
coords_by_code = df_latlong.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Fail with a readable message (before touching df) if any postal code has no coordinates.
missing_codes = sorted(set(df['Postal Code']) - set(coords_by_code.index))
if missing_codes:
    raise ValueError('No coordinates found for postal codes: {}'.format(missing_codes))

# Vectorised lookup instead of a Python loop over every row
df['Latitude'] = df['Postal Code'].map(coords_by_code['Latitude'])
df['Longitude'] = df['Postal Code'].map(coords_by_code['Longitude'])

df.head(10)

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


---
## Part 3: Exploring and clustering the neighborhoods in Toronto

In [None]:
import requests # library to handle requests
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
import geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# import k-means from clustering stage
from sklearn.cluster import KMeans

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML

# Matplotlib and associated plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
    
# transforming JSON into a pandas dataframe
# ! conda install -c anaconda pandas
import json
try:
    # pandas >= 1.0 exposes json_normalize at the top level
    from pandas import json_normalize
except ImportError:
    # older pandas releases only provide it under pandas.io.json
    from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: \ 

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; stop here with a clear message
# instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto City are {}, {}.'.format(latitude, longitude))

In [None]:
# create a map centred on the Toronto coordinates obtained above
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of df_torontoll, labelled "<neighborhood>, <borough>"
marker_fields = zip(
    df_torontoll['Latitude'],
    df_torontoll['Longitude'],
    df_torontoll['Borough'],
    df_torontoll['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_fields:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [None]:
# keep only the rows whose borough name contains the word "Toronto"
in_toronto = df_torontoll['Borough'].str.contains('Toronto')
df_bor_toronto = df_torontoll[in_toronto].reset_index(drop=True)
print(df_bor_toronto.shape)
df_bor_toronto.head(10)

In [None]:
# explore the neighborhood stored in the 5th row (index label 4) - The Beaches
neighborhood_latitude = df_bor_toronto.at[4, 'Latitude']    # neighborhood latitude value
neighborhood_longitude = df_bor_toronto.at[4, 'Longitude']  # neighborhood longitude value
neighborhood_name = df_bor_toronto.at[4, 'Neighborhood']    # neighborhood name

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

In [None]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
explore_url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
url = explore_url_template.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)
# Display the URL with the credentials masked: echoing `url` itself would store
# CLIENT_ID and CLIENT_SECRET in the notebook output that gets shared.
explore_url_template.format(
    '<CLIENT_ID>',
    '<CLIENT_SECRET>',
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

In [None]:
# Bound the wait with a timeout and surface HTTP errors (bad credentials, quota exceeded)
# here, rather than as a confusing KeyError when the payload is read in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [None]:
def get_category_type(row):
    """Return the name of the first category of a venue row, or None if it has none.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide the category list under the key 'categories' or, failing that,
        under 'venue.categories'. Each list entry is a mapping with a 'name' key.

    Returns
    -------
    The 'name' of the first entry in the category list, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present in `row`.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real problem
        # and must not be hidden.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [None]:
# the venue records live in the first group of the explore response
venues = results['response']['groups'][0]['items']

# flatten the nested JSON and keep only name, categories and coordinates
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the last dotted component of every column name (e.g. 'venue.name' -> 'name')
nearby_venues.columns = [column.split(".")[-1] for column in nearby_venues.columns]

nearby_venues.head(10)

In [None]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

In [None]:
# explore all neighborhoods in boroughs that contain the word "Toronto"


# function to repeat the same process as with The Beaches for all neighborhoods in boroughs that contain the word "Toronto"
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Relies on the notebook-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood name and coordinates; iterated together with zip().
    radius : int, default 500
        Value sent as the `radius` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty (but still has these
        columns) when no venue is returned. 'Venue Category' is None for a venue whose
        category list is empty.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; bound the wait and fail loudly on HTTP errors
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come back without any category; record None instead of crashing
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor keeps the schema even when there are no rows
    return pd.DataFrame(venue_rows, columns=column_names)

In [None]:
borough_Toronto_venues = getNearbyVenues(names=df_bor_toronto['Neighborhood'],
                                   latitudes=df_bor_toronto['Latitude'],
                                   longitudes=df_bor_toronto['Longitude']
                                  )

In [None]:
print(borough_Toronto_venues.shape) #the size of the resulting dataframe
borough_Toronto_venues.head()

In [None]:
# check how many venues were returned for each neighborhood
borough_Toronto_venues.groupby('Neighborhood').count()

In [None]:
# find out how many unique categories can be curated from all the returned venues
# (dropna=False counts a missing category as its own value, like len(unique()) does)
print('There are {} unique categories.'.format(borough_Toronto_venues['Venue Category'].nunique(dropna=False)))

In [None]:
# analyse each neighborhood in boroughs that contain the word "Toronto"

# one hot encoding: one indicator column per venue category
borough_Toronto_onehot = pd.get_dummies(borough_Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category can itself be called "Neighborhood". Rename that indicator column so
# it is neither overwritten by, nor confused with, the neighborhood-name column added below.
if 'Neighborhood' in borough_Toronto_onehot.columns:
    borough_Toronto_onehot = borough_Toronto_onehot.rename(
        columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood column back to the dataframe, directly as the first column
borough_Toronto_onehot.insert(0, 'Neighborhood', borough_Toronto_venues['Neighborhood'])

borough_Toronto_onehot.head()

In [None]:
borough_Toronto_onehot.shape

In [None]:
# group the rows by neighborhood and take the mean of each category indicator,
# i.e. the frequency of occurrence of each category within the neighborhood

borough_Toronto_grouped = borough_Toronto_onehot.groupby('Neighborhood', as_index=False).mean()
borough_Toronto_grouped.head(10)

In [None]:
borough_Toronto_grouped.shape

In [None]:
# print each neighborhood along with the top 5 most common venues
# (num_top_venues is reassigned to 10 in a later cell)
num_top_venues = 5

for hood in borough_Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # select this neighborhood's row and transpose it so that every column becomes a row
    temp = borough_Toronto_grouped[borough_Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # skip the first row: it holds the 'Neighborhood' value, not a category frequency
    temp = temp.iloc[1:]
    # the transposed column has to be converted to float before rounding and sorting
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    # highest frequencies first, limited to the top num_top_venues entries
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
# turn into a pandas dataframe

# a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`, skipping its first entry.

    The first element of `row` (the neighborhood name in this notebook) is ignored; the
    remaining values are sorted in descending order and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
# create the new dataframe and display the top 10 venues for each neighborhood

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: 1st, 2nd, 3rd, then 4th, 5th, ...
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of catching every exception from the list lookup
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = borough_Toronto_grouped['Neighborhood']

# fill the ranking columns row by row from the grouped category frequencies
for ind in np.arange(borough_Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(borough_Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [None]:
# Cluster Neighborhoods
# Run k-means to cluster the neighborhood into 5 clusters
# set number of clusters
kclusters = 5

# k-means needs numeric features only, so drop the neighborhood name column
borough_Toronto_grouped_clustering = borough_Toronto_grouped.drop('Neighborhood', axis=1)

# run k-means clustering
# n_init is set explicitly so the result does not depend on the scikit-learn version's
# default number of initialisations; random_state fixes the seed.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(borough_Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

In [None]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

# add clustering labels
# DataFrame.insert raises if the column already exists, so drop a label column left over
# from a previous run of this cell to keep the cell re-runnable.
if "Cluster Labels" in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns="Cluster Labels")
neighborhoods_venues_sorted.insert(0, "Cluster Labels", kmeans.labels_)

In [None]:
neighborhoods_venues_sorted.head()

In [None]:
borough_Toronto_merged = df_bor_toronto.copy()

# add the cluster label and ranked venue columns to every neighborhood (left join on name)
borough_Toronto_merged = borough_Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Neighborhoods with no venue in the grouped table come out of the left join with a missing
# cluster label, which also turns the whole column into floats. Drop those rows and restore
# integer labels so the labels can be used as list indices when colouring the map.
borough_Toronto_merged = borough_Toronto_merged.dropna(subset=['Cluster Labels']).reset_index(drop=True)
borough_Toronto_merged['Cluster Labels'] = borough_Toronto_merged['Cluster Labels'].astype(int)

borough_Toronto_merged.head()

In [None]:
# visualize the resulting clusters

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows without a cluster label cannot be coloured, so they are left off the map.
clustered = borough_Toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels']):
    # the label column may hold floats (e.g. after a join introduced NaN); list indices must be ints
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster-1 keeps the existing colour assignment (label 0 wraps to the last colour)
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### Examine each cluster

In [None]:
# CLUSTER 1 (label 0): show column 1 plus every column from position 5 onwards

borough_Toronto_merged[borough_Toronto_merged['Cluster Labels'] == 0].iloc[:, [1] + list(range(5, borough_Toronto_merged.shape[1]))]

In [None]:
# CLUSTER 2 (label 1): show column 1 plus every column from position 5 onwards

borough_Toronto_merged[borough_Toronto_merged['Cluster Labels'] == 1].iloc[:, [1] + list(range(5, borough_Toronto_merged.shape[1]))]

In [None]:
# CLUSTER 3 (label 2): show column 1 plus every column from position 5 onwards

borough_Toronto_merged[borough_Toronto_merged['Cluster Labels'] == 2].iloc[:, [1] + list(range(5, borough_Toronto_merged.shape[1]))]

In [None]:
# CLUSTER 4 (label 3): show column 1 plus every column from position 5 onwards

borough_Toronto_merged[borough_Toronto_merged['Cluster Labels'] == 3].iloc[:, [1] + list(range(5, borough_Toronto_merged.shape[1]))]


In [None]:
# CLUSTER 5 (label 4): show column 1 plus every column from position 5 onwards

borough_Toronto_merged[borough_Toronto_merged['Cluster Labels'] == 4].iloc[:, [1] + list(range(5, borough_Toronto_merged.shape[1]))]

## The END

### Thank you