## Install required packages
<hr>
First of all, let's install some packages to scrape Wikipedia page.

In [None]:
#install beautifulsoup4
!pip install beautifulsoup4
#install lxml
!pip install lxml
#install html5lib
!pip install html5lib
#install requests
!pip install requests

## Web scraping and datafile creation
<hr>

Now we will scrape wikipedia web page to get the table. Then, we create a csv file that contains extracted data. The file is called `Toronto_neighborhood.csv`and does not contain rows where Borough is not assigned.



In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
#get table source code
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
#print(soup.prettify())

Let's transform the table to a csv file

In [4]:
#parse table and create csv file 
import csv
table = soup.find('table', class_='wikitable sortable')
table_body = table.find('tbody')
i=0
with open('Toronto_neighborhood.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['PostalCode', 'Borough', 'Neighborhood'])

    rows = table_body.find_all('tr')
    for row in rows[1:]:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        i+=1
        if cols[1] != 'Not assigned':#when Borough is not assigned
            csv_writer.writerow(cols)
print(i)

287


## CSV file preprocessing
<hr>
In this section we will combine Neighborhood that corresponds to the same Postal code. Then, Neighborhood that is not assigned will be the same as Borough.

In [5]:
import pandas as pd
toronto_df = pd.read_csv('Toronto_neighborhood.csv')

In [6]:
toronto_df.shape

(210, 3)

In [7]:
# create a table that contains combined Neighborhood
combined_df = toronto_df.groupby(['PostalCode'])['Neighborhood'].apply(lambda x: ','.join(x)).reset_index()

In [8]:
# Merge the two tables to obtain the combined table
toronto_df = pd.merge(toronto_df[['PostalCode', 'Borough']].drop_duplicates(), combined_df, on='PostalCode', how='inner')

In [9]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [10]:
#Replace Not assigned Neighborhood with Borough
toronto_df = toronto_df.apply(lambda x: x.replace(x['Neighborhood'],x['Borough'] if (x['Neighborhood']=='Not assigned') else x['Neighborhood']) , axis=1)

In [11]:
toronto_df.shape

(103, 3)

## Add Longitude and lattitude data
<hr>
In this section we will use another csv file to get geospatial data.

In [13]:
#read csv file for Geospatial coordinates
geo_coord = pd.read_csv('Geospatial_Coordinates.csv')

In [14]:
geo_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Let's merge the two dataframes (toronto_df and geo_coord) to get the final dataframe.

In [15]:
#Merge toronto_df and geo_coord dataframe. Then delete 'Postal Code' column
toronto_df = pd.merge(toronto_df, geo_coord, how='left', left_on='PostalCode', right_on='Postal Code')
toronto_df.drop('Postal Code', axis=1, inplace=True)

In [16]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [17]:
#save result dataframe.
toronto_df.to_csv('Totonto_data.csv', index=False)

## Explore neighborhood in Toronto
<hr>
Let's first make of Toronto with different neighborhoods.

In [None]:
#install required packages
!pip install geopy
!pip install folium

Let's explore the city of Toronto by making a map with markers to different neighborhoods*.

\* It's based on latitude and longitude data which may correspond to more than one neighborhood.

In [19]:
# Let's get latitude and longitude of Toronto

from geopy.geocoders import Nominatim

address = 'Toronto, CA'

geolocator = Nominatim(user_agent='dc_user')
location = geolocator.geocode(address, timeout=10)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [20]:
# create map of Toronto using latitude and longitude values
import folium

toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

To analyze different neighborhoods, we need to get data about venues in every neighborhood.

In [22]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Let's get data about all venues in Toronto. We will extract venues informations for every neighborhood and put it in a dataframe called `venues_df`. 

In [25]:
#Create Venue dataframe
venues_df = pd.DataFrame(columns=['PostalCode', 'VenueName', 'VenueCategory', 'VenueLatitude', 'VenueLongitude'])

for idx, pCode in enumerate(toronto_df.PostalCode.values):
    neighborhood_latitude = toronto_df.loc[idx, 'Latitude']
    neighborhood_longitude = toronto_df.loc[idx, 'Longitude']
    #Form url
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            neighborhood_latitude, 
            neighborhood_longitude
            )
    #get results
    results = requests.get(url).json()

    venues = results['response']['groups'][0]['items']

    nearby_venues = pd.io.json.json_normalize(venues) # flatten JSON
    # filter columns
    filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng', 'venue.location.postalCode']
    nearby_venues =nearby_venues.loc[:, filtered_columns]

    # filter the category for each row
    nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

    # clean columns
    nearby_venues = nearby_venues.rename(columns = {'venue.name': 'VenueName',
                                   'venue.categories': 'VenueCategory',
                                   'venue.location.lat': 'VenueLatitude',
                                   'venue.location.lng': 'VenueLongitude',
                                   'venue.location.postalCode': 'VenuePostalCode'})
    nearby_venues['PostalCode'] = pCode
    venues_df = venues_df.append(nearby_venues, ignore_index=True)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [26]:
print(venues_df.shape)
venues_df.head()

(3090, 6)


Unnamed: 0,PostalCode,VenueCategory,VenueLatitude,VenueLongitude,VenueName,VenuePostalCode
0,M3A,Caribbean Restaurant,43.75984,-79.324719,Allwyn's Bakery,M3A 1Z5
1,M3A,Golf Course,43.752816,-79.342741,Donalda Golf & Country Club,M3A 2Z7
2,M3A,Event Space,43.763923,-79.342961,Graydon Hall Manor,M3A 3B4
3,M3A,Gym / Fitness Center,43.747665,-79.347077,LA Fitness,M3B 2X2
4,M3A,Supermarket,43.75352,-79.349518,Galleria Supermarket,M3B 1Y5


We wanted to have all available results, so we didn't use neither a limit nor a radius. Therefore, we need to check for duplicate data.

In [27]:
venues_df.duplicated(subset=['VenueCategory', 'VenueLatitude', 'VenueLongitude', 'VenueName'], keep=False).sum()

2130

**2130 duplicates!!** So we need to make a lot of cleaning. We need to associate every duplicate venue to the nearest neighborhood. We will first merge toronto_df with venues_df, so we can get longitude and latitude for neighborhoods.

### Delete duplicate venues 
<hr>

Let's first get unique list of duplicate venues.

In [28]:
#duplicate contains the first duplicate of every duplicate venue
duplicate = venues_df[venues_df.duplicated(subset=['VenueCategory', 'VenueLatitude', 'VenueLongitude', 'VenueName'], keep=False)]
duplicate = duplicate.drop_duplicates(subset=['VenueCategory', 'VenueLatitude', 'VenueLongitude', 'VenueName'], keep='first')

In [29]:
duplicate.isnull().sum()

PostalCode           0
VenueCategory        0
VenueLatitude        0
VenueLongitude       0
VenueName            0
VenuePostalCode    186
dtype: int64

We can't use only postal codes from API since there is 186 null values in duplicate. So we proceed as follows: For every venue, if its VenuePostalCode is null, then we will associate it to the nearest neighborhood. Otherwise, we will extract the first part of its postal code.

In [30]:
#this function returns all duplicates for every venue
import numpy as np
def get_duplicates(lat, lng, name, df):
    return df[(np.isclose(df['VenueLatitude'], lat)) & (np.isclose(df['VenueLongitude'], lng)) & (df['VenueName']==name)]

Now let's write a function that calculates the distance `as the crow flies` between two points and chooses the nearest one from a list of points. Then we will replace neighborhood in duplicates with the nearest one and then drop all duplicates.  

In [32]:
from scipy.spatial import distance

def get_nearest(source, destinations):
    '''
    source: tuple of the source
    destinations: list of tuples of destinations
    
    returns a tuple of the nearest destination
    '''
    minimum = 10
    nearest = destinations[0]
    for idx, pc in enumerate(destinations):
        geo = geo_coord[geo_coord['Postal Code'] == pc][['Latitude', 'Longitude']]
        d = distance.euclidean(source, geo)
        if d < minimum:
            nearest = pc
        
    return nearest, d

In [33]:
#create a duplicate_final where postal codes for duplicate venues are fixed 
duplicate_final = pd.DataFrame(columns=venues_df.columns[1:])
for idx, venue in duplicate.iterrows():
    temp_dup = get_duplicates(venue['VenueLatitude'], venue['VenueLongitude'], venue['VenueName'], venues_df)
    #if not null: 
    if pd.notna(temp_dup.VenuePostalCode.values[0]):
        v_pcode = venue['VenuePostalCode'].split()[0]
        
    else: #if nan
        source = (venue['VenueLatitude'], venue['VenueLongitude'])
        #possible_geocoords = geo_coord[geo_coord['Postal Code'].isin(temp_dup['PostalCode'])]
        destinations= temp_dup['PostalCode'].values
        v_pcode, _ = get_nearest(source, destinations)
        
    row = venue[1:].copy()
    row['VenuePostalCode']= v_pcode
    duplicate_final = duplicate_final.append(row, ignore_index=True)

In [34]:
duplicate_final.head()

Unnamed: 0,VenueCategory,VenueLatitude,VenueLongitude,VenueName,VenuePostalCode
0,Caribbean Restaurant,43.75984,-79.324719,Allwyn's Bakery,M3A
1,Golf Course,43.752816,-79.342741,Donalda Golf & Country Club,M3A
2,Event Space,43.763923,-79.342961,Graydon Hall Manor,M3A
3,Gym / Fitness Center,43.747665,-79.347077,LA Fitness,M3B
4,Supermarket,43.75352,-79.349518,Galleria Supermarket,M3B


In [35]:
#drop duplicate from venues_df
venues_df = venues_df.drop_duplicates(subset=['VenueCategory', 'VenueLatitude', 'VenueLongitude', 'VenueName'], keep=False)


In [36]:
venues_df.isnull().sum()

PostalCode           0
VenueCategory        0
VenueLatitude        0
VenueLongitude       0
VenueName            0
VenuePostalCode    207
dtype: int64

Now we need to replace null values (venuesPostalcode)in venues_df(non duplicate) with postalcode.

In [37]:
#replace Nan VenuePostalCode with PostalCode

venues_df = venues_df.apply(lambda x: x.fillna(x['PostalCode']), axis=1)

In [38]:
venues_df = venues_df.apply(lambda x: x.replace(x['VenuePostalCode'], x['VenuePostalCode'].split()[0]), axis=1)

In [39]:
venues_df = venues_df.drop(['PostalCode'], axis=1)

Let's add duplicate venues to venues_df

In [41]:
venues_df = venues_df.append(duplicate_final, ignore_index=True)

In [42]:
venues_df.VenuePostalCode.unique()

array(['M3A', 'M4A', 'M5A', 'M5R', 'M6A', 'M4Y', 'M5G', 'M5S', 'M1B',
       'M3B', 'M4B', 'M4C', 'M5B', 'M6B', 'M9B', 'M1C', 'L1V', 'M3C',
       'M5C', 'M6C', 'M6G', 'M9C', 'L4X', 'L4W', 'M1E', '4551', 'M4E',
       'M5E', 'M6E', 'M1G', 'M4G', 'M5T', 'M6H', 'M1H', 'M1P', 'L3R',
       'M2H', 'L3T', 'M2J', 'M3H', 'M5M', 'M4J', 'M5H', 'M5V', 'M1J',
       'M3J', 'L4K', 'M5J', 'M6J', 'M1K', 'M3K', 'M4K', 'm4k1n2', 'M6K',
       'M1L', 'M2K', 'M4L', 'M5L', 'M6M', 'M9L', 'L4L', 'M9M', 'M1M',
       'M1N', 'M2M', 'M4M', 'M4N', 'M9W', 'M1l3y1', 'M2N', 'M3N', 'M2L',
       'M6N', 'M5P', 'M4R', 'M5N', 'M6P', 'M9P', 'M1R', 'M2R', 'L4J',
       'M4P', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M6S', 'M1T', 'M4T',
       'M4W', 'M1V', 'M4V', 'M8V', 'M8Z', 'M9V', 'L4H', 'M1W', 'M4X',
       'M8W', 'L4V', 'L5P', 'L6B', 'L3S', 'M1X', 'M8X', 'M7Y', 'M8Y',
       'M37', 'M4H', 'M3M', 'M6L', 'M4Y-1Z2', 'M9A', 'M1s4n6', 'M431E6',
       'M5W', 'M5X', 'M2P', 'M5K', 'M9N', 'M3L', 'L1L'], dtype=object)

Let's correct erroneous postal codes

In [43]:
#delete postal codes that begin with L
venues_df = venues_df[~venues_df['VenuePostalCode'].str.startswith('L')]

In [44]:
venues_df.replace({'m4k1n2': 'M4K', 'M1l3y1': 'M1L', 'M4Y-1Z2': 'M4Y', 'M1s4n6': 'M1S', 'M431E6': 'M4A' }, inplace=True)

In [49]:
venues_df = venues_df[~(venues_df['VenuePostalCode']=='4551')]

Let's get neighboor and borough names for every postal code.

In [53]:
venues_df = venues_df.merge(toronto_df, left_on='VenuePostalCode', right_on='PostalCode')[['PostalCode', 'Borough', 'Neighborhood', 'VenueName', 'VenueCategory', 'VenueLatitude', 'VenueLongitude']]

In [54]:
venues_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,VenueName,VenueCategory,VenueLatitude,VenueLongitude
0,M3A,North York,Parkwoods,Brookbanks Park,Park,43.751976,-79.33214
1,M3A,North York,Parkwoods,David Duncan House,Steakhouse,43.758259,-79.348886
2,M3A,North York,Parkwoods,A&W Canada,Fast Food Restaurant,43.760643,-79.326865
3,M3A,North York,Parkwoods,Allwyn's Bakery,Caribbean Restaurant,43.75984,-79.324719
4,M3A,North York,Parkwoods,Donalda Golf & Country Club,Golf Course,43.752816,-79.342741


Now that we have the final dataframe, let's explore how many venues there are in every neighborhood.

In [55]:
venues_df.groupby('Neighborhood')[['VenueLatitude', 'VenueLongitude', 'VenueName']].count()

Unnamed: 0_level_0,VenueLatitude,VenueLongitude,VenueName
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Adelaide,King,Richmond",27,27,27
Agincourt,24,24,24
"Agincourt North,L'Amoreaux East,Milliken,Steeles East",16,16,16
"Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown",8,8,8
"Alderwood,Long Branch",12,12,12
...,...,...,...
Willowdale West,9,9,9
Woburn,2,2,2
"Woodbine Gardens,Parkview Hill",8,8,8
Woodbine Heights,15,15,15


Let's see how many categories can we find in all venues.

In [56]:
print('There are {} categories'.format(len(venues_df['VenueCategory'].unique())))

There are 246 categories


## Analyze Each Neighborhood
<hr>
We will first create one hot representation of venues categories for venues.

In [85]:
# one hot encoding
toronto_onehot = pd.get_dummies(venues_df[['VenueCategory']],prefix='', prefix_sep='')

# add postalCode column back to dataframe
toronto_onehot['PostalCode'] = venues_df['PostalCode'] 
toronto_onehot['Neighborhood'] = venues_df['Neighborhood'] 

# move PostalCode column to the first column
fixed_columns = ['PostalCode', 'Neighborhood'] + toronto_onehot.columns[:-2].tolist()
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,PostalCode,Neighborhood,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Aquarium,Art Gallery,Arts & Crafts Store,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M3A,Parkwoods,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's get the mean occurrence of venues categories for every venue.

In [86]:
toronto_grouped = toronto_onehot.groupby('PostalCode').mean().reset_index()
toronto_grouped

Unnamed: 0,PostalCode,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.142857
1,M1C,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
2,M1E,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
3,M1G,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
4,M1H,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,M9N,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.142857,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
98,M9P,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.000000
99,M9R,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
100,M9V,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000


Now let's create a profile for every venue.

In [90]:
num_top_venues = 5

for code in toronto_grouped['PostalCode']:
    print("----"+venues_df[venues_df['PostalCode']==code]['Neighborhood'].iloc[0]+"----")
    temp = toronto_grouped[toronto_grouped['PostalCode'] == code].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Rouge,Malvern----
           venue  freq
0            Zoo  0.14
1       Pharmacy  0.14
2   Dessert Shop  0.14
3     Campground  0.14
4  National Park  0.14


----Highland Creek,Rouge Hill,Port Union----
                venue  freq
0           BBQ Joint  0.12
1  Athletics & Sports  0.12
2                Park  0.12
3               Beach  0.12
4      Cosmetics Shop  0.12


----Guildwood,Morningside,West Hill----
                 venue  freq
0  Fried Chicken Joint   0.1
1        Smoothie Shop   0.1
2          Coffee Shop   0.1
3         Burger Joint   0.1
4           Sports Bar   0.1


----Woburn----
               venue  freq
0  Food & Drink Shop   0.5
1        Coffee Shop   0.5
2  Afghan Restaurant   0.0
3               Park   0.0
4        Music Venue   0.0


----Cedarbrae----
                  venue  freq
0                 Hotel  0.09
1              Pharmacy  0.09
2  Caribbean Restaurant  0.09
3            Steakhouse  0.09
4      Sushi Restaurant  0.09


----Scarborough Village----


Now let's create a dataframe that contains top 10 venue categories for every neighborhood.

In [91]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [97]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Zoo,National Park,Spa,Dessert Shop,Campground,Pharmacy,Bakery,Farm,Farmers Market,Fast Food Restaurant
1,M1C,Pub,Athletics & Sports,Park,Grocery Store,Italian Restaurant,BBQ Joint,Beach,Cosmetics Shop,Discount Store,Ethiopian Restaurant
2,M1E,Breakfast Spot,Smoothie Shop,Coffee Shop,Fried Chicken Joint,Park,Grocery Store,Liquor Store,Sports Bar,Pizza Place,Burger Joint
3,M1G,Food & Drink Shop,Coffee Shop,Ethiopian Restaurant,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Filipino Restaurant,Field
4,M1H,Sushi Restaurant,Coffee Shop,Music Store,Hotel,Pharmacy,Caribbean Restaurant,Hakka Restaurant,Indian Restaurant,Gym,Steakhouse


### Cluster neighborhoods

In [98]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('PostalCode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int32)

In [99]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [100]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = neighborhoods_venues_sorted.set_index('PostalCode').merge(geo_coord, left_on='PostalCode', right_on='Postal Code')

toronto_merged.head() # check the last columns!

Unnamed: 0,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Postal Code,Latitude,Longitude
0,4,Zoo,National Park,Spa,Dessert Shop,Campground,Pharmacy,Bakery,Farm,Farmers Market,Fast Food Restaurant,M1B,43.806686,-79.194353
1,1,Pub,Athletics & Sports,Park,Grocery Store,Italian Restaurant,BBQ Joint,Beach,Cosmetics Shop,Discount Store,Ethiopian Restaurant,M1C,43.784535,-79.160497
2,1,Breakfast Spot,Smoothie Shop,Coffee Shop,Fried Chicken Joint,Park,Grocery Store,Liquor Store,Sports Bar,Pizza Place,Burger Joint,M1E,43.763573,-79.188711
3,0,Food & Drink Shop,Coffee Shop,Ethiopian Restaurant,Food,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Filipino Restaurant,Field,M1G,43.770992,-79.216917
4,1,Sushi Restaurant,Coffee Shop,Music Store,Hotel,Pharmacy,Caribbean Restaurant,Hakka Restaurant,Indian Restaurant,Gym,Steakhouse,M1H,43.773136,-79.239476


In [104]:

import matplotlib.pyplot as plt
import matplotlib as matp
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = plt.cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [matp.colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Postal Code'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters