# Capstone Assignment : 
In this assignment, you will be required to explore, segment, and cluster the neighborhoods in the city of Toronto. However, unlike New York, the neighborhood data is not readily available on the internet. What is interesting about the field of data science is that each project can be challenging in its unique way, so you need to learn to be agile and refine the skill to learn new libraries and tools quickly depending on the project.

In [128]:
import sys
!{sys.executable} -m pip install bs4



In [130]:
# importing necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [182]:
# getting data from internet
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_wikipedia_page= requests.get(wikipedia_link).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(raw_wikipedia_page, "html.parser")

In [243]:
debug = False
table = soup.find('table')

Postcode      = []
Borough       = []
Neighborhood = []


counter = 0
Postcode_var      = None
Borough_var       = None
Neighbourhood_var = None
# extracting a clean form of the table
for tr_cell in table.find_all('tr'):
    if counter == 0:
        counter = 1
        continue
        
    info = tr_cell.find_all('td')
    Postcode_var = info[0].text.strip()
    Borough_var = info[1].text.strip()
    Neighborhood_var = info[2].text.replace('/', ',').strip()
    

    if Borough_var == 'Not assigned':
        counter += 1
        continue
    
    if Neighbourhood_var == 'Not assigned':
        Neighborhood_var = Borough_var
    
    
    Postcode.append(Postcode_var)
    Borough.append(Borough_var)
    Neighborhood.append(Neighborhood_var)
    
    if debug: 
        print('counter :', counter)
        print('Postcode_var :',Postcode_var)
        print('Borough_var :', Borough_var)
        print('Neighborhood_var :',Neighborhood_var) 
        print('x'*40)
            
    counter += 1

# The wikipedia page has been modified to contain all the neighbourhoods in a single Postcode in a single cell seperated by a '/' so I have simply replaced '/' with a ','.

In [244]:
toronto_dict = {'Postcode':Postcode, 'Borough':Borough, 'Neighborhood':Neighborhood}
df_toronto = pd.DataFrame.from_dict(toronto_dict)

### To check if there are any repetitions of postcodes I compared the number of rows to the number of unique postcodes

In [245]:
print('Number of rows in the dataframe: {} Rows'.format(df_toronto.shape[0]))
print('Number of unique values in the dataframe: ')
df_toronto.Postcode.unique

Number of rows in the dataframe: 103 Rows
Number of unique values in the dataframe: 


<bound method Series.unique of 0      M3A
1      M4A
2      M5A
3      M6A
4      M7A
      ... 
98     M8X
99     M4Y
100    M7Y
101    M8Y
102    M8Z
Name: Postcode, Length: 103, dtype: object>

In [246]:
# checking correctness of the data
df_toronto[df_toronto['Postcode']=='M5G']

Unnamed: 0,Postcode,Borough,Neighborhood
24,M5G,Downtown Toronto,Central Bay Street


In [247]:
#shape of the dataframe: 
df_toronto.shape

(103, 3)

In [248]:
df_toronto

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [189]:
#!conda install -c conda-forge geopy --yes 
#from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



## creating a function to retrive latitude and longitude information

In [191]:
def getlatlog(postal_code):
    address = '{}, Toronto, Ontario'.format(postal_code)
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    try:
        latitude = location.latitude
        longitude = location.longitude
    except:
        latitude = np.nan
        longitude = np.nan
    
    return latitude, longitude

In [156]:
latlist = []
lonlist = []
for each in df_toronto['Postcode']:
    lat, lon = getlatlog(each)
    latlist.append(lat)
    lonlist.append(lon)

In [170]:
print('Number of missing latitude data \n{}'.format(pd.DataFrame(latlist).isnull().sum()))
print('Number of missing longitude data \n{}'.format(pd.DataFrame(lonlist).isnull().sum()))

Number of missing latitude data 
0    78
dtype: int64
Number of missing longitude data 
0    78
dtype: int64


## Therefore we retrive the data using the Geospatial data provided in the csv file

In [249]:
geospatial_data = pd.read_csv("Geospatial_Coordinates.csv")

In [250]:
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [251]:
df_toronto['Latitude'] = np.nan
df_toronto['Longitude'] = np.nan
for each in df_toronto['Postcode']:
    df_toronto.loc[df_toronto['Postcode']==each, 'Latitude'] = geospatial_data[geospatial_data['Postal Code']==each]['Latitude'].values[0]
    df_toronto.loc[df_toronto['Postcode']==each,'Longitude'] = geospatial_data[geospatial_data['Postal Code']==each]['Longitude'].values[0]


In [252]:
df_toronto

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509


In [253]:
# checking correctness of the data
df_toronto[df_toronto['Postcode']=='M5G']

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [307]:
neighborhoods = df_toronto.drop(columns='Postcode')

In [308]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [309]:
list(neighborhoods.Borough.unique())

['North York',
 'Downtown Toronto',
 'Etobicoke',
 'Scarborough',
 'East York',
 'York',
 'East Toronto',
 'West Toronto',
 'Central Toronto',
 'Mississauga']

## Retriving the location of Toronto

In [310]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [311]:
'''# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')'''

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


# Diplaying the neighborhoods on a folium map

In [313]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [314]:
downtown_data = neighborhoods[neighborhoods['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
1,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
2,Downtown Toronto,"Garden District , Ryerson",43.657162,-79.378937
3,Downtown Toronto,St. James Town,43.651494,-79.375418
4,Downtown Toronto,Berczy Park,43.644771,-79.373306


# Diplaying the neighborhoods on a folium map as just Downtown Toronto

In [315]:
address = 'Downtown Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

# create map of Manhattan using latitude and longitude values
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, label in zip(downtown_data['Latitude'], downtown_data['Longitude'], downtown_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

The geograpical coordinate of Manhattan are 43.6563221, -79.3809161.


In [316]:
CLIENT_ID = 'JKKLZNOSIQT4F3SOVNQY3O4UTGJZ3TVPBQKQB0KCT1KB5UTQ' # your Foursquare ID
CLIENT_SECRET = 'GDEGJUMGDBLS32KDF2QAIX4FCCHUGKQF3JFFXMM3XH5MMDN1' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JKKLZNOSIQT4F3SOVNQY3O4UTGJZ3TVPBQKQB0KCT1KB5UTQ
CLIENT_SECRET:GDEGJUMGDBLS32KDF2QAIX4FCCHUGKQF3JFFXMM3XH5MMDN1


In [317]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [318]:
downtown_venues = getNearbyVenues(names=downtown_data['Neighborhood'],
                                   latitudes=downtown_data['Latitude'],
                                   longitudes=downtown_data['Longitude']
                                  )

# Retriving the venues in downtown toronto 

In [None]:
downtown_venues.head()

In [320]:
# one hot encoding
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_onehot['Neighborhood'] = downtown_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
downtown_onehot = downtown_onehot[fixed_columns]


# One hot encoding and normalizing the data

In [321]:
downtown_onehot.head()
downtown_onehot.shape

(518, 145)

In [322]:
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
downtown_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,...,Tea Room,Thai Restaurant,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
1,"CN Tower , King and Spadina , Railway Lands , ...",0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0
5,"Commerce Court , Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"First Canadian Place , Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,...,0.066667,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
7,"Garden District , Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033333,0.033333,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Harbourfront East , Union Station , Toronto Is...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,...,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0
9,"Kensington Market , Chinatown , Grange Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.066667,0.033333


In [323]:
num_top_venues = 5

for hood in downtown_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = downtown_grouped[downtown_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.07
2  Seafood Restaurant  0.07
3            Beer Bar  0.07
4                Park  0.03


----CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport----
                 venue  freq
0      Airport Service  0.18
1       Airport Lounge  0.12
2     Airport Terminal  0.12
3  Rental Car Location  0.06
4                  Bar  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.23
1                Café  0.10
2  Italian Restaurant  0.07
3         Yoga Studio  0.03
4    Sushi Restaurant  0.03


----Christie----
           venue  freq
0  Grocery Store  0.24
1           Café  0.18
2           Park  0.12
3     Restaurant  0.06
4    Candy Store  0.06


----Church and Wellesley----
              venue  freq
0  Ramen Restaurant  0.03
1          Creperie  0.03
2   Bubble Tea Shop  0.03
3      Burger Joint  0.03


In [324]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# Finding out the most popular locations in each neighborhood

In [325]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_grouped['Neighborhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Seafood Restaurant,Cocktail Bar,Coffee Shop,Beer Bar,Bakery,Cheese Shop,Park,Comfort Food Restaurant,Breakfast Spot,Liquor Store
1,"CN Tower , King and Spadina , Railway Lands , ...",Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Sculpture Garden,Plane,Bar,Boat or Ferry,Rental Car Location,Boutique
2,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Yoga Studio,Spa,Gastropub,Discount Store,Hotel,Ice Cream Shop,Comic Shop
3,Christie,Grocery Store,Café,Park,Athletics & Sports,Diner,Nightclub,Candy Store,Coffee Shop,Baby Store,Italian Restaurant
4,Church and Wellesley,Burger Joint,Bookstore,Beer Bar,Indian Restaurant,Restaurant,Italian Restaurant,Japanese Restaurant,Ramen Restaurant,Pub,Pizza Place


## CLUSTER NEIGHBORS

In [326]:
from sklearn.cluster import KMeans

In [327]:
# set number of clusters
kclusters = 5
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
downtown_grouped_clustering = downtown_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 4, 3, 2, 1, 1, 1, 1, 4, 1], dtype=int32)

In [328]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

downtown_merged = downtown_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
downtown_merged = downtown_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

downtown_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,3,Coffee Shop,Park,Breakfast Spot,Theater,Bakery,Yoga Studio,Dessert Shop,Pub,Café,Chocolate Shop
1,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494,3,Coffee Shop,Sushi Restaurant,Diner,Yoga Studio,Mexican Restaurant,Sandwich Place,Burger Joint,Burrito Place,Café,Park
2,Downtown Toronto,"Garden District , Ryerson",43.657162,-79.378937,1,Café,Theater,Coffee Shop,Hotel,Plaza,Sandwich Place,Comic Shop,Restaurant,Ramen Restaurant,College Rec Center
3,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Café,Gastropub,Coffee Shop,Middle Eastern Restaurant,Ice Cream Shop,New American Restaurant,Hotel,Jazz Club,Diner,BBQ Joint
4,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,Seafood Restaurant,Cocktail Bar,Coffee Shop,Beer Bar,Bakery,Cheese Shop,Park,Comfort Food Restaurant,Breakfast Spot,Liquor Store


# Displaying the clusters on a folium map

In [332]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighborhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters