## Segmenting and Clustering Neighborhoods in Toronto
### IBM Capstone Assignment

## 1. Creating Dataframe

In [149]:
# importing libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors
print('Libraries imported.')

ModuleNotFoundError: No module named 'folium'

In [11]:
# importing website as text file (HTML code)
Wiki_url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
HTML = requests.get(Wiki_url).text

In [9]:
# Creating dataframe
soup = BeautifulSoup(HTML, 'xml')
table=soup.find('table')
column_names=['Postalcode','Borough','Neighborhood']
df = pd.DataFrame(columns=column_names)

#### The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [111]:
for tr_cell in table.find_all('tr'):
    row_data=[]
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
    if len(row_data)==3:
        df.loc[len(df)] = row_data
        df.head()
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### 1.1 Cleaning Dataframe 

#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned

In [112]:
# cleaning dataframe - deleting rows with 'Borough' not assigned 
indexNames = df[ df['Borough'] =='Not assigned'].index
df.drop(indexNames , inplace=True)
indexNames2 = df[ df['Postalcode'] =="Queen's Park" ].index
df.drop(indexNames2 , inplace=True)
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [113]:
# cleaning dataframe - Replacing 'Not Assigned' Neighborhood with Borough names
df.loc[df['Neighborhood'] =='Not assigned' , 'Neighborhood'] = df['Borough']
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


#### More than one neighborhood can exist in one postal code area. These rows will be combined into one row with the neighborhoods separated with a comma

In [129]:
# cleaning dataframe - Merging rows with same postal code
df1= df.groupby('Postalcode').agg(lambda x: ','.join(x))
df1['Borough']= df1['Borough'].str.replace('nan|[{}\s]','').str.split(',').apply(set).str.join(',').str.strip(',').str.replace(",{2,}",",")
df1['Neighborhood']= df1['Neighborhood'].str.replace('nan|[{}\s]','').str.split(',').apply(set).str.join(',').str.strip(',').str.replace(",{2,}",",")
df2=df1.reset_index()
df2.head(20)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"HighlandCreek,PortUnion,RougeHill"
2,M1E,Scarborough,"Guildwood,WestHill,Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,ScarboroughVillage
6,M1K,Scarborough,"KennedyPark,Ionview,EastBirchmountPark"
7,M1L,Scarborough,"Oakridge,GoldenMile,Clairlea"
8,M1M,Scarborough,"Cliffside,Cliffcrest,ScarboroughVillageWest"
9,M1N,Scarborough,"CliffsideWest,BirchCliff"


#### Print the number of rows of your dataframe.

In [130]:
df2.shape

(102, 3)

## 2. Creating dataframe with Latitudes & Logitudes

In [132]:
def get_geocode(postal_code):
    # initialize your variable to None
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude,longitude
# Importing geo data
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')
geo_df.head()
# finding latitudes & Longitudes
geo_df.rename(columns={'Postal Code':'Postalcode'},inplace=True)
geo_merged = pd.merge(geo_df, df2, on='Postalcode')
# creating dataframe with latitudes & Longitudes
geo_data=geo_merged[['Postalcode','Borough','Neighborhood','Latitude','Longitude']]
geo_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"HighlandCreek,PortUnion,RougeHill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,WestHill,Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## 3. Exploring and clustering the neighborhoods in Toronto.

In [133]:
# filter dataframew for only Toronto
toronto_data=geo_data[geo_data['Borough'].str.contains("Toronto")]
toronto_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
37,M4E,EastToronto,TheBeaches,43.676357,-79.293031
41,M4K,EastToronto,"Riverdale,TheDanforthWest",43.679557,-79.352188
42,M4L,EastToronto,"IndiaBazaar,TheBeachesWest",43.668999,-79.315572
43,M4M,EastToronto,StudioDistrict,43.659526,-79.340923
44,M4N,CentralToronto,LawrencePark,43.72802,-79.38879


In [134]:
# calling foursquare API
CLIENT_ID = 'QEF4SFROUUVOQKNPAFNNRDUW4ACAWSYYG312LM3BKEDDKPIZ' 
CLIENT_SECRET = 'CCBFW1WDVTUYDLPFX3SR0QBB5R2UKRYOZ3F1JDJ3PLQMPDRM'
VERSION = '20180604'

In [135]:
def getNearbyVenues(names, latitudes, longitudes):
    radius=500
    LIMIT=100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [136]:
# exploring venues
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

TheBeaches
Riverdale,TheDanforthWest
IndiaBazaar,TheBeachesWest
StudioDistrict
LawrencePark
DavisvilleNorth
NorthTorontoWest
Davisville
MoorePark,SummerhillEast
SummerhillWest,ForestHillSE,SouthHill,DeerPark,Rathnelly
Rosedale
Cabbagetown,St.JamesTown
ChurchandWellesley
Harbourfront
GardenDistrict,Ryerson
St.JamesTown
BerczyPark
CentralBayStreet
Adelaide,Richmond,King
HarbourfrontEast,TorontoIslands,UnionStation
TorontoDominionCentre,DesignExchange
CommerceCourt,VictoriaHotel
Roselawn
ForestHillNorth,ForestHillWest
Yorkville,TheAnnex,NorthMidtown
Harbord,UniversityofToronto
KensingtonMarket,Chinatown,GrangePark
CNTower,RailwayLands,HarbourfrontWest,BathurstQuay,SouthNiagara,KingandSpadina,Islandairport
StnAPOBoxes25TheEsplanade
Undergroundcity,FirstCanadianPlace
Christie
DovercourtVillage,Dufferin
Trinity,LittlePortugal
Brockton,ParkdaleVillage,ExhibitionPlace
HighPark,TheJunctionSouth
Parkdale,Roncesvalles
Runnymede,Swansea
Queen'sPark
BusinessReplyMailProcessingCentre969Eastern


In [138]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,TheBeaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,TheBeaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,TheBeaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
3,TheBeaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,TheBeaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub


In [139]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,Richmond,King",100,100,100,100,100,100
BerczyPark,56,56,56,56,56,56
"Brockton,ParkdaleVillage,ExhibitionPlace",24,24,24,24,24,24
BusinessReplyMailProcessingCentre969Eastern,18,18,18,18,18,18
"CNTower,RailwayLands,HarbourfrontWest,BathurstQuay,SouthNiagara,KingandSpadina,Islandairport",17,17,17,17,17,17
"Cabbagetown,St.JamesTown",44,44,44,44,44,44
CentralBayStreet,83,83,83,83,83,83
Christie,18,18,18,18,18,18
ChurchandWellesley,85,85,85,85,85,85
"CommerceCourt,VictoriaHotel",100,100,100,100,100,100


In [140]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot.drop(['Neighborhood'],axis=1,inplace=True) 
toronto_onehot.insert(loc=0, column='Neighborhood', value=toronto_venues['Neighborhood'] )
toronto_onehot.shape

(1704, 232)

In [141]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,"Adelaide,Richmond,King",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.01,0.0
1,BerczyPark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton,ParkdaleVillage,ExhibitionPlace",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333
3,BusinessReplyMailProcessingCentre969Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
4,"CNTower,RailwayLands,HarbourfrontWest,Bathurst...",0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [143]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,Richmond,King",Coffee Shop,Café,Bar,Steakhouse,Asian Restaurant,Restaurant,Burger Joint,Hotel,Thai Restaurant,Cosmetics Shop
1,BerczyPark,Coffee Shop,Cocktail Bar,Seafood Restaurant,Beer Bar,Steakhouse,Bakery,Farmers Market,Cheese Shop,Café,Liquor Store
2,"Brockton,ParkdaleVillage,ExhibitionPlace",Café,Yoga Studio,Breakfast Spot,Coffee Shop,Grocery Store,Pet Store,Performing Arts Venue,Italian Restaurant,Intersection,Gym / Fitness Center
3,BusinessReplyMailProcessingCentre969Eastern,Light Rail Station,Yoga Studio,Auto Workshop,Spa,Burrito Place,Farmers Market,Fast Food Restaurant,Butcher,Restaurant,Brewery
4,"CNTower,RailwayLands,HarbourfrontWest,Bathurst...",Airport Service,Airport Lounge,Airport Terminal,Boutique,Bar,Plane,Boat or Ferry,Harbor / Marina,Sculpture Garden,Airport Gate


### Clustering Neighborhood

In [144]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [145]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,EastToronto,TheBeaches,43.676357,-79.293031,0,Park,Other Great Outdoors,Pub,Health Food Store,Trail,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
41,M4K,EastToronto,"Riverdale,TheDanforthWest",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Lounge,Pub,Indian Restaurant,Sports Bar,Spa
42,M4L,EastToronto,"IndiaBazaar,TheBeachesWest",43.668999,-79.315572,0,Sandwich Place,Gym,Pub,Liquor Store,Burger Joint,Burrito Place,Italian Restaurant,Fast Food Restaurant,Steakhouse,Fish & Chips Shop
43,M4M,EastToronto,StudioDistrict,43.659526,-79.340923,0,Café,Coffee Shop,Gastropub,Bakery,Brewery,Italian Restaurant,American Restaurant,Yoga Studio,Comfort Food Restaurant,Seafood Restaurant
44,M4N,CentralToronto,LawrencePark,43.72802,-79.38879,4,Park,Swim School,Bus Line,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


In [146]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,"Adelaide,Richmond,King",Coffee Shop,Café,Bar,Steakhouse,Asian Restaurant,Restaurant,Burger Joint,Hotel,Thai Restaurant,Cosmetics Shop
1,0,BerczyPark,Coffee Shop,Cocktail Bar,Seafood Restaurant,Beer Bar,Steakhouse,Bakery,Farmers Market,Cheese Shop,Café,Liquor Store
2,0,"Brockton,ParkdaleVillage,ExhibitionPlace",Café,Yoga Studio,Breakfast Spot,Coffee Shop,Grocery Store,Pet Store,Performing Arts Venue,Italian Restaurant,Intersection,Gym / Fitness Center
3,0,BusinessReplyMailProcessingCentre969Eastern,Light Rail Station,Yoga Studio,Auto Workshop,Spa,Burrito Place,Farmers Market,Fast Food Restaurant,Butcher,Restaurant,Brewery
4,0,"CNTower,RailwayLands,HarbourfrontWest,Bathurst...",Airport Service,Airport Lounge,Airport Terminal,Boutique,Bar,Plane,Boat or Ferry,Harbor / Marina,Sculpture Garden,Airport Gate


In [147]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.653963, -79.387207.


In [150]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

NameError: name 'folium' is not defined

#### Unable to import folium. So Maps are not available