# First Let's import required libraries

In [31]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
from pandas.io.json import json_normalize  # transform json files to pandas dataframes
from geopy.geocoders import Nominatim # 
import numpy as np
import csv
!pip install folium
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

print('All modules imported')

All modules imported


Let's start scraping the wikipedia page

In [32]:
# The wikipedia site link
site_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

Get the source code html data from the website

In [33]:
source = requests.get(site_link).text

Lets Use BeautifulSoup to parse it

In [34]:
soup = BeautifulSoup(source, 'lxml')

#print(soup.prettify())

Next let's get the table that contains the data we want to scrape

In [35]:
My_table = soup.find('table',{'class':'wikitable sortable'})

Let's view the table data

In [36]:
# Uncomment below to view table
# My_table

we can see that all the data we want are between the $&lt;td&gt;$ brackets, let's get the data between the td brackets

In [37]:
links = My_table.find_all('td')

In [38]:
# uncomment below to view links
# print(links)

Next let's loop through links and extract only the text elements

In [39]:
text_links = []

for link in links:
    text_links.append(link.text)
    
# uncommnet below to view text_links    
#text_links

let's clean and process the table elements
Let's clean the links and keep only rows with Borough. Out of which we shall rename rows without Neighborhood as Boroughs

In [40]:
cleaned_links = []

while True:
    
    if len(text_links) < 3:
        break
    
    sub = text_links[:3]
    # If 'Not ' in borough then skip that row of data
    if 'Not ' in sub[1]:
        text_links = text_links[3:]
    else:
        cleaned_links.append(text_links[:3])
        
        # Let's strip off the \n at the end of each neighborhood data
        cleaned_links[-1][-1] = cleaned_links[-1][-1].strip('\n')
        
        # If the Borough is available but the Neighborhood is missing
        # make Neighborhood same as Borough
        if 'Not ' in cleaned_links[-1][-1]:
            cleaned_links[-1][-1] = cleaned_links[-1][-2]
        text_links = text_links[3:]
# Uncomment below to view cleaned_links       
#cleaned_links

lets check the length of the cleaned links

In [41]:
len(cleaned_links)

103

Next let's add the neighborhood data of each duplicate Postal Codes together to the first instance or row that contains the PostalCode

In [42]:
link = []
for i in range(len(cleaned_links)):
    x = cleaned_links[i][0]
    if x in link:
        cleaned_links[link.index(x)][-1] += ', ' + cleaned_links[i][-1]
    link.append(x)
    
# uncomment below
#cleaned_links

Next let's pass the cleaned _links to a data frame and set index to postal code so that we can easily work on it

In [43]:
df = pd.DataFrame(cleaned_links, columns=['PostalCode','Borough','Neighborhood'])
df.index= df.PostalCode

In [44]:
# Let's view the data frame
df.head(7)

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M3A\n,M3A\n,North York\n,Parkwoods
M4A\n,M4A\n,North York\n,Victoria Village
M5A\n,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"
M6A\n,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights"
M7A\n,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government"
M9A\n,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village"
M1B\n,M1B\n,Scarborough\n,"Malvern, Rouge"


Next let's Use the pandas duplicate method to drop duplicate index

In [45]:
df = df.loc[~df.index.duplicated(keep='first')]

In [46]:
# Let's see the shape so far
df.shape

(103, 3)

In [47]:
df.head()

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M3A\n,M3A\n,North York\n,Parkwoods
M4A\n,M4A\n,North York\n,Victoria Village
M5A\n,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"
M6A\n,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights"
M7A\n,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government"


Next let's reset the index back and drop the current index

In [48]:
df.reset_index(drop=True, inplace=True)

# Let's see the first few rows
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods
1,M4A\n,North York\n,Victoria Village
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government"
5,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village"
6,M1B\n,Scarborough\n,"Malvern, Rouge"
7,M3B\n,North York\n,Don Mills
8,M4B\n,East York\n,"Parkview Hill, Woodbine Gardens"
9,M5B\n,Downtown Toronto\n,"Garden District, Ryerson"


In [49]:
df.shape

(103, 3)

## Appending the Latitude and Longitude data
Let's define a simple method that we can apply to each Borough to get its Latitude and Longitude using the apply() method

In [50]:
def latitude_longitude(Borough):
    """ Method takes a Series object and returns
    a list of Latitude and corresponding Longitude data,
    using the geopy library.
    This method also prints out the coordinate data"""
    
    address = Borough
    
    geolocator = Nominatim(user_agent="CA_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
    return [latitude, longitude]

Let's append the list containing corresponding lat and lon data to column Latitude

In [51]:
df['Latitude'] = df.Borough.apply(latitude_longitude)

The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of Downtown Toronto
 are 43.6541737, -79.38081164513409.
The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of Downtown Toronto
 are 43.6541737, -79.38081164513409.
The geograpical coordinate of Etobicoke
 are 43.6435559, -79.5656326.
The geograpical coordinate of Scarborough
 are 54.2820009, -0.4011868.
The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of East York
 are 43.699971000000005, -79.33251996261595.
The geograpical coordinate of Downtown Toronto
 are 43.6541737, -79.38081164513409.
The geograpical coordinate of North York
 are 43.7543263, -79.44911696639593.
The geograpical coordinate of Etobicoke
 are 43.6435559, -79.5656326.
The geograpical coordinate of Scarborough
 are 54.2820009, -0.

In [22]:
# Lets see the updated data with Latitude containing lists of lats and lons data

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude
0,M3A\n,North York\n,Parkwoods,"[43.7543263, -79.44911696639593]"
1,M4A\n,North York\n,Victoria Village,"[43.7543263, -79.44911696639593]"
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront","[43.6541737, -79.38081164513409]"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights","[43.7543263, -79.44911696639593]"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government","[43.6541737, -79.38081164513409]"


Next let's loop through the data frame and separate Latitude from Longitude and make lat and lons just numbers not lists

In [52]:
lon_list = []
for i, j in df.iterrows():
    lon_list.append(j.Latitude[1])
    j.Latitude = j.Latitude[0]
    
# next let's assign the lon_list as the value of the Longitude Column

df['Longitude'] = lon_list

In [53]:
# let's view the changes

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A\n,North York\n,Parkwoods,43.7543,-79.449117
1,M4A\n,North York\n,Victoria Village,43.7543,-79.449117
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront",43.6542,-79.380812
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights",43.7543,-79.449117
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government",43.6542,-79.380812
5,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village",43.6436,-79.565633
6,M1B\n,Scarborough\n,"Malvern, Rouge",54.282,-0.401187
7,M3B\n,North York\n,Don Mills,43.7543,-79.449117
8,M4B\n,East York\n,"Parkview Hill, Woodbine Gardens",43.7,-79.33252
9,M5B\n,Downtown Toronto\n,"Garden District, Ryerson",43.6542,-79.380812


In [54]:
# Let's print the shape of the data frame

df.shape

(103, 5)

### EDA of Toronto Neighborhood Clusters
Let's explore and cluster the neighborhoods in Toronto. We can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did earlier to the New York City data.

In [55]:
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A\n,North York\n,Parkwoods,43.7543,-79.449117
1,M4A\n,North York\n,Victoria Village,43.7543,-79.449117
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront",43.6542,-79.380812
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights",43.7543,-79.449117
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government",43.6542,-79.380812
5,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village",43.6436,-79.565633
6,M1B\n,Scarborough\n,"Malvern, Rouge",54.282,-0.401187
7,M3B\n,North York\n,Don Mills,43.7543,-79.449117
8,M4B\n,East York\n,"Parkview Hill, Woodbine Gardens",43.7,-79.33252
9,M5B\n,Downtown Toronto\n,"Garden District, Ryerson",43.6542,-79.380812


Let's see the unique Borough names we have in the Data Frame

In [27]:
df.Borough.unique()

array(['North York\n', 'Downtown Toronto\n', 'Etobicoke\n',
       'Scarborough\n', 'East York\n', 'York\n', 'East Toronto\n',
       'West Toronto\n', 'Central Toronto\n', 'Mississauga\n'],
      dtype=object)

Let's loop through the data frame and get the index of Boroughs that end with 'Toronto\n'

In [56]:
toronto_list = [i for i in df.index if df.iloc[i,1].endswith('Toronto\n')]

Let's slice through the data frame to select only these rows of Boroughs ending with Toronto

In [57]:
Toronto_df = df.iloc[toronto_list,].reset_index(drop=True)

let's visualize the first 10 rows

In [58]:
Toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront",43.6542,-79.380812
1,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government",43.6542,-79.380812
2,M5B\n,Downtown Toronto\n,"Garden District, Ryerson",43.6542,-79.380812
3,M5C\n,Downtown Toronto\n,St. James Town,43.6542,-79.380812
4,M4E\n,East Toronto\n,The Beaches,43.6248,-79.393492
5,M5E\n,Downtown Toronto\n,Berczy Park,43.6542,-79.380812
6,M5G\n,Downtown Toronto\n,Central Bay Street,43.6542,-79.380812
7,M6G\n,Downtown Toronto\n,Christie,43.6542,-79.380812
8,M5H\n,Downtown Toronto\n,"Richmond, Adelaide, King",43.6542,-79.380812
9,M6H\n,West Toronto\n,"Dufferin, Dovercourt Village",43.6535,-79.383935


Lets see the unique Boroughs once again in Toronto

In [59]:
Toronto_df.Borough.unique()

array(['Downtown Toronto\n', 'East Toronto\n', 'West Toronto\n',
       'Central Toronto\n'], dtype=object)

Let's get the geographic coordinates of Toronto

In [60]:
address = 'Toronto\n'

geolocator = Nominatim(user_agent='CA_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('The geograpical coordinates of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinates of Toronto
 are 43.6534817, -79.3839347.


let's visualize Toronto and the neighborhoods in it.

In [61]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Neighborhood']):
    label = "{}, {}".format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto  # Feel free to zoom in to see more

Let's explore the data using foursquare API

In [62]:
CLIENT_ID = 'HHLYJJN4PDQWIOGD2VBZMYZPB2CJ0BLIDOKMOKVSQ5W2BIZV' # your Foursquare ID
CLIENT_SECRET = 'D32R0IJGPGEDTAI3KCHLYYJGZGB4T2ALXEA0ON0KNEZ0K0FY' # your Foursquare Secret
VERSION = '20190727' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HHLYJJN4PDQWIOGD2VBZMYZPB2CJ0BLIDOKMOKVSQ5W2BIZV
CLIENT_SECRET:D32R0IJGPGEDTAI3KCHLYYJGZGB4T2ALXEA0ON0KNEZ0K0FY


Let's confirm the shape of Toronto_df

In [63]:
Toronto_df.shape  # This shows there are 39 rows and 5 columns

(39, 5)

Next let's see the unique borough names in Toronto_df

In [64]:
Toronto_df.Borough.unique()

array(['Downtown Toronto\n', 'East Toronto\n', 'West Toronto\n',
       'Central Toronto\n'], dtype=object)

## Okay so lets explore the first location in 'Downtown Toronto'
Let's get the details of the first entry

In [65]:
first_entry_Downtown_Toronto = Toronto_df[Toronto_df.Borough =='Downtown Toronto\n'].head(1)

first_entry_Downtown_Toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront",43.6542,-79.380812


Now, let's get the top 100 venues that are in the first entry within a radius of 500 meters.
First let's create the get request url

In [66]:
radius = 500
LIMIT=100

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    first_entry_Downtown_Toronto.Latitude[0], 
    first_entry_Downtown_Toronto.Longitude[0], 
    radius, 
    LIMIT)

Send the GET request and examine the resutls

In [67]:
result = requests.get(url).json()

result.keys()

dict_keys(['meta', 'response'])

In [68]:
# Uncomment below to view the result json data
# result

function that extracts the category of the venue

In [69]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a pandas dataframe.

In [70]:
venues = result['response']['groups'][0]['items']

# Uncomment below to view the venues data
# venues

In [71]:
nearby_venues = json_normalize(venues) # flatten JSON

  if __name__ == '__main__':


In [72]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng', 'venue.location.distance']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng,distance
0,Elgin And Winter Garden Theatres,Theater,43.653394,-79.378507,204
1,UNIQLO ユニクロ,Clothing Store,43.65591,-79.380641,193
2,Indigo,Bookstore,43.653515,-79.380696,73
3,Ed Mirvish Theatre,Theater,43.655102,-79.379768,133
4,LUSH,Cosmetics Shop,43.653557,-79.3804,76


And how many venues were returned by Foursquare?

In [73]:
print('The total number of venues returned is {}'.format(len(nearby_venues)))

The total number of venues returned is 96


Okay so lets explore venues in Central Toronto

In [74]:
central_toronto_df = Toronto_df[Toronto_df.Borough == 'Central Toronto\n']

central_toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
18,M4N\n,Central Toronto\n,Lawrence Park,43.6535,-79.383935
19,M5N\n,Central Toronto\n,Roselawn,43.6535,-79.383935
20,M4P\n,Central Toronto\n,Davisville North,43.6535,-79.383935
21,M5P\n,Central Toronto\n,"Forest Hill North & West, Forest Hill Road Park",43.6535,-79.383935
23,M4R\n,Central Toronto\n,"North Toronto West, Lawrence Park",43.6535,-79.383935
24,M5R\n,Central Toronto\n,"The Annex, North Midtown, Yorkville",43.6535,-79.383935
26,M4S\n,Central Toronto\n,Davisville,43.6535,-79.383935
29,M4T\n,Central Toronto\n,"Moore Park, Summerhill East",43.6535,-79.383935
31,M4V\n,Central Toronto\n,"Summerhill West, Rathnelly, South Hill, Forest...",43.6535,-79.383935


Let's create a function to repeat the same process to all the neighborhoods in Central Toronto

In [75]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name'],
            v['venue']['location']['distance']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category',
                  'Venue Distance']
    
    return(nearby_venues)

In [77]:
central_toronto_venues = getNearbyVenues(names=central_toronto_df['Neighborhood'],
                                   latitudes=central_toronto_df['Latitude'],
                                   longitudes=central_toronto_df['Longitude']
                                  )

Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
North Toronto West,  Lawrence Park
The Annex, North Midtown, Yorkville
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park


Let's check the size of the resulting dataframe

In [78]:
print(central_toronto_venues.shape)

central_toronto_venues.head()

(666, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Venue Distance
0,Lawrence Park,43.653482,-79.383935,Downtown Toronto,43.653232,-79.385296,Neighborhood,113
1,Lawrence Park,43.653482,-79.383935,Nathan Phillips Square,43.65227,-79.383516,Plaza,138
2,Lawrence Park,43.653482,-79.383935,Japango,43.655268,-79.385165,Sushi Restaurant,222
3,Lawrence Park,43.653482,-79.383935,Indigo,43.653515,-79.380696,Bookstore,260
4,Lawrence Park,43.653482,-79.383935,Poke Guys,43.654895,-79.385052,Poke Place,181


Let's check how many venues were returned per neighborhood

In [79]:
central_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Venue Distance
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Davisville,74,74,74,74,74,74,74
Davisville North,74,74,74,74,74,74,74
"Forest Hill North & West, Forest Hill Road Park",74,74,74,74,74,74,74
Lawrence Park,74,74,74,74,74,74,74
"Moore Park, Summerhill East",74,74,74,74,74,74,74
"North Toronto West, Lawrence Park",74,74,74,74,74,74,74
Roselawn,74,74,74,74,74,74,74
"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",74,74,74,74,74,74,74
"The Annex, North Midtown, Yorkville",74,74,74,74,74,74,74


Let's find out how many unique categories can be curated from all the returned venues

In [80]:
print('There are {} unique categories of Venues'.format(central_toronto_venues['Venue Category'].nunique()))

There are 55 unique categories of Venues


## Analyze Each Neighborhood

In [81]:
# one hot encoding
central_toronto_onehot = pd.get_dummies(central_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

In [82]:
# add neighborhood column back to dataframe
central_toronto_onehot['Neighborhood'] = central_toronto_venues['Neighborhood']

In [83]:
# move neighborhood column to the first column
fixed_columns = [central_toronto_onehot.columns[-1]] + list(central_toronto_onehot.columns[:-1])
central_toronto_onehot = central_toronto_onehot[fixed_columns]

pd.set_option('display.max_columns', None)
central_toronto_onehot.head()

Unnamed: 0,Vietnamese Restaurant,American Restaurant,Art Museum,Bank,Bookstore,Breakfast Spot,Bubble Tea Shop,Burger Joint,Café,Clothing Store,Cocktail Bar,Coffee Shop,Colombian Restaurant,Comic Shop,Concert Hall,Cosmetics Shop,Department Store,Diner,Electronics Store,Fast Food Restaurant,Food Court,Furniture / Home Store,Gastropub,Gym / Fitness Center,Hotel,Latin American Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Music Venue,Neighborhood,New American Restaurant,Noodle House,Office,Opera House,Pizza Place,Plaza,Poke Place,Ramen Restaurant,Restaurant,Salad Place,Seafood Restaurant,Shoe Store,Shopping Mall,Smoothie Shop,Steakhouse,Sushi Restaurant,Tanning Salon,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Video Game Store
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Lawrence Park,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Let's examine the new Data frame size

In [84]:
central_toronto_onehot.shape

(666, 55)

### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [85]:
central_toronto_grouped = central_toronto_onehot.groupby('Neighborhood').mean().reset_index()
central_toronto_grouped

Unnamed: 0,Neighborhood,Vietnamese Restaurant,American Restaurant,Art Museum,Bank,Bookstore,Breakfast Spot,Bubble Tea Shop,Burger Joint,Café,Clothing Store,Cocktail Bar,Coffee Shop,Colombian Restaurant,Comic Shop,Concert Hall,Cosmetics Shop,Department Store,Diner,Electronics Store,Fast Food Restaurant,Food Court,Furniture / Home Store,Gastropub,Gym / Fitness Center,Hotel,Latin American Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Music Venue,New American Restaurant,Noodle House,Office,Opera House,Pizza Place,Plaza,Poke Place,Ramen Restaurant,Restaurant,Salad Place,Seafood Restaurant,Shoe Store,Shopping Mall,Smoothie Shop,Steakhouse,Sushi Restaurant,Tanning Salon,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Video Game Store
0,Davisville,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.067568,0.013514,0.081081,0.013514,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.040541,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.027027,0.013514,0.013514
1,Davisville North,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.067568,0.013514,0.081081,0.013514,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.040541,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.027027,0.013514,0.013514
2,"Forest Hill North & West, Forest Hill Road Park",0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.067568,0.013514,0.081081,0.013514,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.040541,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.027027,0.013514,0.013514
3,Lawrence Park,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.067568,0.013514,0.081081,0.013514,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.040541,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.027027,0.013514,0.013514
4,"Moore Park, Summerhill East",0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.067568,0.013514,0.081081,0.013514,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.040541,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.027027,0.013514,0.013514
5,"North Toronto West, Lawrence Park",0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.067568,0.013514,0.081081,0.013514,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.040541,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.027027,0.013514,0.013514
6,Roselawn,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.067568,0.013514,0.081081,0.013514,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.040541,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.027027,0.013514,0.013514
7,"Summerhill West, Rathnelly, South Hill, Forest...",0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.067568,0.013514,0.081081,0.013514,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.040541,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.027027,0.013514,0.013514
8,"The Annex, North Midtown, Yorkville",0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.067568,0.013514,0.081081,0.013514,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.040541,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.013514,0.013514,0.027027,0.013514,0.027027,0.013514,0.013514,0.013514,0.013514,0.013514,0.013514,0.027027,0.027027,0.013514,0.013514


In [86]:
central_toronto_grouped.shape

(9, 55)

Let's print each neighborhood along with the top 5 most common venues

In [87]:
num_top_venues = 5

for hood in central_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = central_toronto_grouped[central_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.07
2           Hotel  0.04
3      Restaurant  0.03
4           Plaza  0.03


----Davisville North----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.07
2           Hotel  0.04
3      Restaurant  0.03
4           Plaza  0.03


----Forest Hill North & West, Forest Hill Road Park----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.07
2           Hotel  0.04
3      Restaurant  0.03
4           Plaza  0.03


----Lawrence Park----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.07
2           Hotel  0.04
3      Restaurant  0.03
4           Plaza  0.03


----Moore Park, Summerhill East----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.07
2           Hotel  0.04
3      Restaurant  0.03
4           Plaza  0.03


----North Toronto West,  Lawrence Park----
            venue  freq
0     Coffee Shop  0.08
1  Clothing Store  0.

Lets put this into a Pandas Data frame

first let's write a function tp sort the venues in descending order

In [88]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Next let's create the new Data frame and display the Top 10 venues for each neighborhood

In [89]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = central_toronto_grouped['Neighborhood']

for ind in np.arange(central_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(central_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
1,Davisville North,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
2,"Forest Hill North & West, Forest Hill Road Park",Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
3,Lawrence Park,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
4,"Moore Park, Summerhill East",Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant


#### Cluster Neighborhoods

In [90]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

Run k-means to cluster the neighborhood into 5 clusters.

In [91]:
# set number of clusters
kclusters = 5

c_toronto_grouped_clustering = central_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(c_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]



array([0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [92]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

c_toronto_merged = central_toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
c_toronto_merged = c_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

c_toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,M4N\n,Central Toronto\n,Lawrence Park,43.6535,-79.383935,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
19,M5N\n,Central Toronto\n,Roselawn,43.6535,-79.383935,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
20,M4P\n,Central Toronto\n,Davisville North,43.6535,-79.383935,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
21,M5P\n,Central Toronto\n,"Forest Hill North & West, Forest Hill Road Park",43.6535,-79.383935,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
23,M4R\n,Central Toronto\n,"North Toronto West, Lawrence Park",43.6535,-79.383935,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant


Finally, let's visualize the resulting clusters

In [93]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(c_toronto_merged['Latitude'], c_toronto_merged['Longitude'], c_toronto_merged['Neighborhood'], c_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

##### Examine Clusters
Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.
Cluster 1

In [94]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 0, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
18,Central Toronto\n,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
19,Central Toronto\n,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
20,Central Toronto\n,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
21,Central Toronto\n,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
23,Central Toronto\n,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
24,Central Toronto\n,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
26,Central Toronto\n,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
29,Central Toronto\n,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant
31,Central Toronto\n,0,Coffee Shop,Clothing Store,Hotel,Seafood Restaurant,Diner,Cosmetics Shop,Restaurant,Plaza,Theater,American Restaurant


Cluster Two

In [95]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 1, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


Cluster Three

In [96]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 2, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


Cluster Four

In [97]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 3, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


Cluster Five

In [98]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 4, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
