 #                     IBM APPLIED Data Science Capstone Project

#      Segmenting and Clustering Neighborhoods in Toronto


In [40]:
import numpy as np                        # library to handle data in a vectorized manner

import pandas as pd                       # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json                               # library to handle JSON files

!conda install -c conda-forge geopy --yes           # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim               # convert an address into latitude and longitude values

import requests                                    # library to handle requests
from pandas.io.json import json_normalize                 # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes          # uncomment this line if you haven't completed the Foursquare API lab
import folium                                             # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


#   WebScraping Toronto Boroughs & Pin-Codes.


Install Beautiful Soup Package & Continue....

Extract the WikiTable with 287 rows.

In [41]:
from bs4 import BeautifulSoup
import requests
import csv

source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source, 'lxml')

table=soup.table.tbody

table_df=pd.DataFrame()

for string in table.stripped_strings:
    table_df=table_df.append([string])

Borough_df=pd.DataFrame(table_df.to_numpy().reshape(-1,3))
Borough_df.columns = Borough_df.iloc[0]
Borough_df=Borough_df.drop(0).reset_index(drop=True)
Borough_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


### Now that we have the required table. Let us remove 'Not assigned' [210 rows remain]  and Data Wrangling
After Wrangling, we are left with 103 unique postal codes...!

In [42]:
#Remove 'NA' Borough, if Neighbourhood="NA", use Borough value
Borough_df=Borough_df[Borough_df['Borough']!='Not assigned'].reset_index(drop=True)

Borough_df=Borough_df.replace('Not assigned',np.NaN)
Borough_df["COL3"] = Borough_df["Neighbourhood"].fillna(Borough_df["Borough"])
Borough_df=Borough_df.drop(['Neighbourhood'],axis=1).reset_index(drop=True)
Borough_df. rename (columns={'COL3':'Neighbourhood'}, inplace=True)

#Group by Postcode & Borough with Concatenation of Neighbourhood Vales

Grouped_df=Borough_df.groupby(['Postcode','Borough'],as_index=False).agg(', '.join)
print(Grouped_df.head())

print('\n \n The dataframe has {} boroughs \n and \n {} ROWS .'.format(
        len(Grouped_df['Borough'].unique()),
        Grouped_df.shape[0]))

0 Postcode      Borough                           Neighbourhood
0      M1B  Scarborough                          Rouge, Malvern
1      M1C  Scarborough  Highland Creek, Rouge Hill, Port Union
2      M1E  Scarborough       Guildwood, Morningside, West Hill
3      M1G  Scarborough                                  Woburn
4      M1H  Scarborough                               Cedarbrae

 
 The dataframe has 10 boroughs 
 and 
 103 ROWS .


## Fetch Location data from csv file that has the geographical coordinates

In [43]:
loc_df = pd.read_csv ('http://cocl.us/Geospatial_data')
loc_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [44]:
# Merge the Location Data with the Neighbourhood Data
Tor_df = pd.merge(Grouped_df, loc_df, left_on='Postcode', right_on='Postal Code', how='left')
Tor_df=Tor_df.drop(['Postal Code'],axis=1).reset_index(drop=True)
Tor_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Create a map of Toronto Neighbourhoods using FOLIUM

#### Use Geopy to get Location of Toronto Neighbourhoods

#### Create Neighbourhood Map of Toronto

In [45]:
# Get the Latitude & Longitude Data for Toronto City
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


## Select only Toronto Boroughs

In [46]:
NH_df=Tor_df[Tor_df['Borough'].str.contains('Toronto')].sort_values (by = 'Neighbourhood').drop(['Postcode'],axis=1).reset_index(drop=True)
NH_df

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
1,Downtown Toronto,Berczy Park,43.644771,-79.373306
2,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191
3,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
4,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442
5,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
6,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049
8,Downtown Toronto,Christie,43.669542,-79.422564
9,Downtown Toronto,Church and Wellesley,43.66586,-79.38316


In [47]:
# create map of Toronto with Neighbourhoods using latitude and longitude values
map_tor = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(NH_df['Latitude'], NH_df['Longitude'], NH_df['Borough'], NH_df['Neighbourhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)  
    
map_tor

Enter the Fourquare credentials

In [48]:
CLIENT_ID = 'GQYZTHDW4FELZGYMP0DIQXVDFGXO5MMAVZATGRXRFKCPJV4X' # your Foursquare ID
CLIENT_SECRET = 'IWPETGS0XZIYMQBDYOZO2QRWDE0UJFHOBRA1YPTL1PK3F4SL' # your Foursquare Secret
VERSION = '20190202'
LIMIT = 100
print('Your credentails have been entered')


Your credentails have been entered


# TEST if we can obtain data from Fourquare:

### Get Restaurants data from FourSquare Toronto City [ 43.653963, -79.387207]

In [49]:
search_query = 'Restaurant'  #['Restaurant','School','Bar','Medical','Shop']
radius = 500


url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'\
.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)

results = requests.get(url).json()


In [50]:
# Extract the Venues List from JSON and store in DF
venues = results['response']['venues']

# tranform venues into a dataframe
rest_df = json_normalize(venues)
rest_df=rest_df[['name','location.lat','location.lng']]
rest_df.head()

Unnamed: 0,name,location.lat,location.lng
0,Hemispheres Restaurant & Bistro,43.654884,-79.385931
1,Some Time BBQ Grill Restaurant 碳烤屋,43.655874,-79.393826
2,Hong Shing Chinese Restaurant,43.654925,-79.387089
3,Tundra Restaurant,43.65001,-79.385608
4,Cali Restaurant,43.655068,-79.386375


In [51]:

rest_df['name'].count()

40

###### There are 40 restaurants within 500m of Toronto City Center.

========================================================================================================================================================================

# CLUSTERING NEIGHBOURHOODS OF TORONTO

### PLAN:
### Toronto is among the world's most livable cities.   Check:  https://www.businessinsider.in/The-50-most-livable-cities-in-the-world-in-2018/T-7-Toronto-Canada/slideshow/65454301.cms
### 1) Cluster Neighbourhoods to find out the most LIVABLE neighbourhoods

### 2) For Every Neighbourhood , Calculate "Lifestyle Score" & "Convenience Score"
### 3) Lifestyle Score  =  Number of Bars and Restaurants [Higher the number, higher the social life residents]
### 4) Conveninece Score  =   Number of Schools & Shops [Higher the number of these venues, the more convenient it is to live]

### 5) Finally cluster and map the Neighbourhoods.

#### Get the Venue Counts for all Neighbourhoods [39] .
#### For this repetitive process, Create a user-defined function to Get Venues & Store them in Data Frame !

In [52]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'\
.format(CLIENT_ID, CLIENT_SECRET, lat, lng, VERSION, search, radius, LIMIT)

            
        # make the GET request
        results = requests.get(url).json()
        venues = results['response']['venues']

           
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            v['name']) for v in venues])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                              'Venue Count']
    
    return(nearby_venues)

#### Use the user-defined function to get Data for all Neighbourhoods.
### Get Restaurant count for all Neighbourhoods

In [53]:
search = 'Restaurant'
Tor_venues1 = getNearbyVenues(names=NH_df['Neighbourhood'],
                                   latitudes=NH_df['Latitude'],
                                   longitudes=NH_df['Longitude']
                                  )
Tor_venues1=Tor_venues1.sort_values (by = 'Neighbourhood').reset_index(drop=True)
Tor_venues1. rename (columns={'Venue Count':'Restaurants'}, inplace=True)
Tor_Group1=Tor_venues1.groupby(['Neighbourhood'],as_index=False).count()

Tor_Group1.head()

Unnamed: 0,Neighbourhood,Restaurants
0,"Adelaide, King, Richmond",42
1,Berczy Park,13
2,"Brockton, Exhibition Place, Parkdale Village",4
3,"Cabbagetown, St. James Town",8
4,Central Bay Street,41


### Lets us now get counts for Bars,Shops & Schools

In [54]:
search = 'Bar'
Tor_venues2 = getNearbyVenues(names=NH_df['Neighbourhood'],
                                   latitudes=NH_df['Latitude'],
                                   longitudes=NH_df['Longitude']
                                  )
Tor_venues2=Tor_venues2.sort_values (by = 'Neighbourhood').reset_index(drop=True)
Tor_venues2. rename (columns={'Venue Count':'Bars'}, inplace=True)
Tor_Group2=Tor_venues2.groupby(['Neighbourhood'],as_index=False).count()

Tor_Group2.head()

Unnamed: 0,Neighbourhood,Bars
0,"Adelaide, King, Richmond",50
1,Berczy Park,45
2,"Brockton, Exhibition Place, Parkdale Village",10
3,Business Reply Mail Processing Centre 969 Eastern,3
4,"Cabbagetown, St. James Town",7


In [55]:
search = 'Shop'
Tor_venues3 = getNearbyVenues(names=NH_df['Neighbourhood'],
                                   latitudes=NH_df['Latitude'],
                                   longitudes=NH_df['Longitude']
                                  )
Tor_venues3=Tor_venues3.sort_values (by = 'Neighbourhood').reset_index(drop=True)
Tor_venues3. rename (columns={'Venue Count':'Shops'}, inplace=True)
Tor_Group3=Tor_venues3.groupby(['Neighbourhood'],as_index=False).count()

Tor_Group3.head()

Unnamed: 0,Neighbourhood,Shops
0,"Adelaide, King, Richmond",50
1,Berczy Park,24
2,"Brockton, Exhibition Place, Parkdale Village",5
3,Business Reply Mail Processing Centre 969 Eastern,2
4,"Cabbagetown, St. James Town",9


In [56]:
search = 'School'
Tor_venues4 = getNearbyVenues(names=NH_df['Neighbourhood'],
                                   latitudes=NH_df['Latitude'],
                                   longitudes=NH_df['Longitude']
                                  )
Tor_venues4=Tor_venues4.sort_values (by = 'Neighbourhood').reset_index(drop=True)
Tor_venues4. rename (columns={'Venue Count':'Schools'}, inplace=True)
Tor_Group4=Tor_venues4.groupby(['Neighbourhood'],as_index=False).count()

Tor_Group4.head()

Unnamed: 0,Neighbourhood,Schools
0,"Adelaide, King, Richmond",12
1,Berczy Park,7
2,"Brockton, Exhibition Place, Parkdale Village",3
3,"CN Tower, Bathurst Quay, Island airport, Harbo...",1
4,"Cabbagetown, St. James Town",6


### Merge Bars & Restaurants data to group based on "LIFESTYLE SCORE"

In [57]:
#Merge Totonto Neighbouhoods with Restaurants DF and Again merge with Bars DF

Tor_Lifestyle = pd.merge(pd.merge(NH_df, Tor_Group1, left_on='Neighbourhood', right_on='Neighbourhood', how='left')\
                         ,Tor_Group2, left_on='Neighbourhood', right_on='Neighbourhood', how='left')
Tor_Lifestyle=Tor_Lifestyle.fillna(0)
Tor_Lifestyle.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Restaurants,Bars
0,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,42.0,50.0
1,Downtown Toronto,Berczy Park,43.644771,-79.373306,13.0,45.0
2,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,4.0,10.0
3,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,0.0,3.0
4,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,0.0,0.0


### Merge Shops & Schools data to group based on "Convenience SCORE"

In [58]:
#Merge Totonto Neighbouhoods with Shops DF and Again merge with Schools DF

Tor_Convenience = pd.merge(pd.merge(NH_df, Tor_Group3, left_on='Neighbourhood', right_on='Neighbourhood', how='left')\
                         ,Tor_Group4, left_on='Neighbourhood', right_on='Neighbourhood', how='left')
Tor_Convenience=Tor_Convenience.fillna(0)
Tor_Convenience.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Shops,Schools
0,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,50.0,12.0
1,Downtown Toronto,Berczy Park,43.644771,-79.373306,24.0,7.0
2,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,5.0,3.0
3,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,2.0,0.0
4,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,0.0,1.0


## CLUSTER the neighbourhoods based on Lifestyle offerings provided by them.

In [59]:
from sklearn.preprocessing import StandardScaler
cluster_lifestyle = StandardScaler().fit_transform(Tor_Lifestyle[['Restaurants','Bars']])

kclusters = 4
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cluster_lifestyle)

# check cluster labels generated for each row in the dataframe
ls_labels=kmeans.labels_
ls_labels[0:10]

array([0, 2, 1, 1, 1, 1, 0, 0, 1, 2], dtype=int32)

In [63]:
# add clustering labels

Tor_Lifestyle["Lifestyle Labels"] = ls_labels
Tor_Lifestyle=Tor_Lifestyle.sort_values (by = 'Lifestyle Labels').reset_index(drop=True)
Tor_Lifestyle.head(10)

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Restaurants,Bars,Lifestyle Labels
0,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,42.0,50.0,0
1,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975,8.0,47.0,0
2,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,18.0,50.0,0
3,Central Toronto,Roselawn,43.711695,-79.416936,0.0,0.0,0
4,Downtown Toronto,"First Canadian Place, Underground city",43.648429,-79.38228,42.0,50.0,0
5,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817,46.0,50.0,0
6,Central Toronto,North Toronto West,43.715383,-79.405678,2.0,8.0,0
7,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,0.0,0.0,0
8,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,0.0,3.0,1
9,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,0.0,0.0,1


In [61]:
Lifestyle_labels=Tor_Lifestyle[['Restaurants','Bars','Lifestyle Labels']].groupby('Lifestyle Labels').mean()
Lifestyle_labels
            

Unnamed: 0_level_0,Restaurants,Bars
Lifestyle Labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,42.875,48.0
1,2.434783,5.173913
2,17.0,48.4
3,8.666667,21.0


## Map the clustered Neighbourhoods for Lifestylescore.


In [62]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Tor_Lifestyle['Latitude'], Tor_Lifestyle['Longitude'], Tor_Lifestyle['Neighbourhood'], Tor_Lifestyle['Lifestyle Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## As you can see Downtown Toronto has the best Lifestyle Label accourding to our K-Means Algorithm