## Part 1 
### Canada borough data preprocessing

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

Upload html file with post codes [data source](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

In [2]:
with open('C:/Users/Random/Documents/GitHub/Coursera_Capstone/ZIP_list_Canada.html') as html_file:
    soup = BeautifulSoup(html_file, 'lxml')
    
table  = soup.table

In [3]:
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park / Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor / Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park / Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern / Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3B
</td>
<td>North York
</td>
<td>Don Mills
</td></tr>
<tr>
<td>M4B
</td>
<td>Ea

### Extract data from the html source

In [4]:
arr = []
for rows in table.find_all('tr'):
    text = rows.text
    arr.append(text)
code = []
borough = []
neighborhood = []

for line in arr:
    elem = line.split('\n')
    code.append(elem[1])
    borough.append(elem[3])
    neighborhood.append(elem[5])
    
column_names = [code[0], borough[0], neighborhood[0]]
code = code[1:]
borough = borough[1:]
neighborhood = neighborhood[1:]
print(column_names)

['Postal code', 'Borough', 'Neighborhood']


In [5]:
code[:5]

['M1A', 'M2A', 'M3A', 'M4A', 'M5A']

In [6]:
borough[:5]

['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto']

In [7]:
neighborhood[:5]

['', '', 'Parkwoods', 'Victoria Village', 'Regent Park / Harbourfront']

### Creating dataframe from Wiki data

In [8]:
df = pd.DataFrame([code, borough, neighborhood]).T
df.columns = column_names
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Cleaning the dataset

In [9]:
#Removing the boroughs that are "Not assigned"

df = df[df['Borough'] != 'Not assigned']
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [10]:
# Combining Neighborhoods with the same Postcode and Borough 
df = df.groupby(['Postal code', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...


In [11]:
# Checking duplicated Neighborhoods
duplicates = df[df.duplicated(['Neighborhood'])]
display(duplicates)

Unnamed: 0,Postal code,Borough,Neighborhood
24,M2R,North York,Willowdale
27,M3C,North York,Don Mills
31,M3L,North York,Downsview
32,M3M,North York,Downsview
33,M3N,North York,Downsview


In [12]:
# Check duplicates
display(df[df['Neighborhood'] == 'Willowdale'])
display(df[df['Neighborhood'] == 'Don Mills'])
display(df[df['Neighborhood'] == 'Downsview'])

Unnamed: 0,Postal code,Borough,Neighborhood
22,M2N,North York,Willowdale
24,M2R,North York,Willowdale


Unnamed: 0,Postal code,Borough,Neighborhood
26,M3B,North York,Don Mills
27,M3C,North York,Don Mills


Unnamed: 0,Postal code,Borough,Neighborhood
30,M3K,North York,Downsview
31,M3L,North York,Downsview
32,M3M,North York,Downsview
33,M3N,North York,Downsview


In [13]:
with pd.option_context('display.max_rows', 500, 'display.max_columns', 50):
    display(df)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West
9,M1N,Scarborough,Birch Cliff / Cliffside West


In [14]:
sum(df.loc[:, 'Neighborhood'] == 'Not assigned') # Check that all 'Non assigned' cases are dropped

0

In [15]:
# Replacing not assigned neighborhoods with the borough values:

df.loc[df['Neighborhood'].str.find('Not assigned') != -1]

Unnamed: 0,Postal code,Borough,Neighborhood


In [16]:
df.shape

(103, 3)

## Part 2
### Adding latitude and longitude to each neighborhood

In [17]:
position = pd.read_csv('Geospatial_Coordinates.csv')
position.columns = ['Postcode', 'Latitude', 'Longitude']
position

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### Merging dataframes

In [18]:
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [19]:
position.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
position.rename(columns={'Postcode':'Postal code'}, inplace=True) 

In [21]:
position.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
df = df.merge(position, how='left', on='Postal code')

In [23]:
with pd.option_context('display.max_rows', 500, 'display.max_columns', 10):
    display(df)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848


## Part 3 
### Clustering

In [24]:
from sklearn.cluster import KMeans
import folium

In [25]:
lat_mean = df['Latitude'].mean()
lon_mean = df['Longitude'].mean()
# create map of New York using latitude and longitude values
toronto_map = folium.Map(location=[lat_mean, lon_mean], zoom_start=10)

# add markers to the map
for lat, lon, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

### Cluster Neighborhoods in Toronto

In [65]:
CLIENT_ID = 'ESLSXSDHK0QR2ECEVEZMVGPIX1CEQYG2K0303C2ILDFVWRCW' # your Foursquare ID
CLIENT_SECRET = 'QCGQJOWXIWDLFLWSJQNZ2OZ4OYUUFUVGO1ZUTMU4PDHBPRJU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ESLSXSDHK0QR2ECEVEZMVGPIX1CEQYG2K0303C2ILDFVWRCW
CLIENT_SECRET:QCGQJOWXIWDLFLWSJQNZ2OZ4OYUUFUVGO1ZUTMU4PDHBPRJU


In [66]:
# Check duplicates
display(df[df['Neighborhood'] == 'Willowdale'])
display(df[df['Neighborhood'] == 'Don Mills'])
display(df[df['Neighborhood'] == 'Downsview'])

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
21,M2N,North York,Willowdale,43.776428,-79.425376


Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
24,M3B,North York,Don Mills,43.735903,-79.346555


Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
27,M3K,North York,Downsview,43.741654,-79.497101


In [67]:
# Combining duplicated Neighborhoods and replacing them with single Lat-Lon position

willowdale_mean_lat = df[df['Neighborhood'] == 'Willowdale']['Latitude'].mean()
willowdale_mean_lon = df[df['Neighborhood'] == 'Willowdale']['Longitude'].mean()

don_mills_mean_lat = df[df['Neighborhood'] == 'Don Mills']['Latitude'].mean()
don_mills_mean_lon = df[df['Neighborhood'] == 'Don Mills']['Longitude'].mean()

downsview_mean_lat = df[df['Neighborhood'] == 'Downsview']['Latitude'].mean()
downsview_mean_lon = df[df['Neighborhood'] == 'Downsview']['Longitude'].mean()

In [68]:
# Drop duplicates and check it
df.drop_duplicates(subset=['Neighborhood'], inplace=True)

display(df[df['Neighborhood'] == 'Willowdale'])
display(df[df['Neighborhood'] == 'Don Mills'])
display(df[df['Neighborhood'] == 'Downsview'])

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
21,M2N,North York,Willowdale,43.776428,-79.425376


Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
24,M3B,North York,Don Mills,43.735903,-79.346555


Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
27,M3K,North York,Downsview,43.741654,-79.497101


In [69]:
df.loc[22, 'Latitude'] = willowdale_mean_lat
df.loc[22, 'Longitude'] = willowdale_mean_lon

df.loc[26, 'Latitude'] = don_mills_mean_lat
df.loc[26, 'Longitude'] = don_mills_mean_lon

df.loc[30, 'Latitude'] = downsview_mean_lat
df.loc[30, 'Longitude'] = downsview_mean_lon

df.reset_index(inplace=True, drop=True)

In [70]:
df[df['Neighborhood'] == 'Upper Rouge']

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude


In [71]:
display(df)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
92,M9N,York,Weston,43.706876,-79.518188
93,M9P,Etobicoke,Westmount,43.696319,-79.532242
94,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.688905,-79.554724
95,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,43.739416,-79.588437


In [72]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000, LIMIT=100):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [73]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude'])

Malvern / Rouge
Rouge Hill / Port Union / Highland Creek
Guildwood / Morningside / West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park / Ionview / East Birchmount Park
Golden Mile / Clairlea / Oakridge
Cliffside / Cliffcrest / Scarborough Village West
Birch Cliff / Cliffside West
Dorset Park / Wexford Heights / Scarborough Town Centre
Wexford / Maryvale
Agincourt
Clarks Corners / Tam O'Shanter / Sullivan
Milliken / Agincourt North / Steeles East / L'Amoreaux East
Steeles West / L'Amoreaux West
Hillcrest Village
Fairview / Henry Farm / Oriole
Bayview Village
York Mills / Silver Hills
Willowdale / Newtonbrook
Willowdale
York Mills West
Parkwoods
Don Mills
Bathurst Manor / Wilson Heights / Downsview North
Northwood Park / York University
Downsview
Victoria Village
Parkview Hill / Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West / Riverdale
India Bazaar / The Beaches West
Studio District
Lawrence Park
Davisville North
North Toron

In [74]:
print(toronto_venues.shape)
toronto_venues.head()

(4823, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Malvern / Rouge,43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,Malvern / Rouge,43.806686,-79.194353,Harvey's,43.80002,-79.198307,Restaurant
2,Malvern / Rouge,43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store
3,Malvern / Rouge,43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
4,Malvern / Rouge,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant


In [87]:
toronto_venues.loc[toronto_venues['Venue Category'] == 'Coffee shop', 'Venue Category'] = 'Café'
toronto_venues.loc[toronto_venues['Venue Category'] == 'Café', 'Venue Category'] = 'Cafe'
toronto_venues.loc[toronto_venues['Venue Category'].str.contains("Restaurant"), 'Venue Category'] = 'Restaurant'
toronto_venues.loc[toronto_venues['Venue Category'].str.contains("Store"), 'Venue Category'] = 'Store'
toronto_venues.loc[toronto_venues['Venue Category'].str.contains("Club"), 'Venue Category'] = 'Club'
toronto_venues.loc[toronto_venues['Venue Category'].str.contains("Shop"), 'Venue Category'] = 'Shop'

In [88]:
toronto_venues['Venue Category'].unique()

array(['Spa', 'Restaurant', 'Store', 'Bank', 'Shop', 'Trail', 'Gym',
       'Supermarket', 'Burger Joint', 'Breakfast Spot', 'Playground',
       'Park', 'Fried Chicken Joint', 'Pizza Place', 'Sports Bar',
       'Pharmacy', 'Sandwich Place', 'Bus Line', 'Athletics & Sports',
       'Bakery', 'Gas Station', 'Yoga Studio', 'Wings Joint', 'Lounge',
       'Construction & Landscaping', 'Intersection', 'Train Station',
       'Bowling Alley', 'Hockey Arena', 'Light Rail Station',
       'Rental Car Location', 'Pub', 'Metro Station', 'Bus Station',
       'Soccer Field', 'Diner', 'General Entertainment', 'Beach', 'Cafe',
       'Skating Rink', 'College Stadium', 'Gym Pool', 'Gaming Cafe',
       'Rental Service', 'Fish Market', 'Badminton Court', 'Flea Market',
       'Noodle House', 'Pool Hall', 'Pool', 'Golf Course', 'Event Space',
       'BBQ Joint', 'Food Court', 'Other Great Outdoors', 'Auto Garage',
       'Tennis Court', 'Recreation Center',
       'Residential Building (Apartment / 

In [89]:
toronto_venues[toronto_venues['Neighborhood'] == 'Upper Rouge'] # no venues found (Park area)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category


### One-hot encoding

In [90]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Malvern / Rouge,43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,Malvern / Rouge,43.806686,-79.194353,Harvey's,43.80002,-79.198307,Restaurant
2,Malvern / Rouge,43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Store
3,Malvern / Rouge,43.806686,-79.194353,Wendy's,43.802008,-79.19808,Restaurant
4,Malvern / Rouge,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Restaurant


In [91]:
toronto_venues[toronto_venues['Venue Category'] == 'Neighborhood'].head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
1317,Studio District,43.659526,-79.340923,Leslieville,43.66207,-79.337856,Neighborhood
1692,Moore Park / Summerhill East,43.689574,-79.38316,Summerhill,43.682976,-79.389123,Neighborhood
1779,Summerhill West / Rathnelly / South Hill / For...,43.686412,-79.400049,Summerhill,43.682976,-79.389123,Neighborhood
2100,"Garden District, Ryerson",43.657162,-79.378937,Downtown Toronto,43.653232,-79.385296,Neighborhood
2255,St. James Town,43.651494,-79.375418,Downtown Toronto,43.653232,-79.385296,Neighborhood


In [92]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot.insert(0, 'Neighborhood name', toronto_venues['Neighborhood'] )

toronto_onehot.head()

Unnamed: 0,Neighborhood name,Airport,Amphitheater,Animal Shelter,Aquarium,Art Gallery,Art Museum,Athletics & Sports,Auto Dealership,Auto Garage,...,Theater,Track,Trail,Train Station,University,Whisky Bar,Wine Bar,Wings Joint,Yoga Studio,Zoo
0,Malvern / Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Malvern / Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Malvern / Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Malvern / Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Malvern / Rouge,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
toronto_grouped = toronto_onehot.groupby('Neighborhood name').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood name,Airport,Amphitheater,Animal Shelter,Aquarium,Art Gallery,Art Museum,Athletics & Sports,Auto Dealership,Auto Garage,...,Theater,Track,Trail,Train Station,University,Whisky Bar,Wine Bar,Wings Joint,Yoga Studio,Zoo
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,Alderwood / Long Branch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,Bathurst Manor / Wilson Heights / Downsview North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,Bedford Park / Lawrence Manor East,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.022222,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,Willowdale / Newtonbrook,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
93,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
94,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
95,York Mills / Silver Hills,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [94]:
arr_1 = toronto_grouped['Neighborhood name'].unique()
arr_2 = df['Neighborhood'].unique()

In [95]:
list(set(arr_2) - set(arr_1))

[]

In [96]:
df = df[df['Neighborhood'] != 'Upper Rouge']
df.reset_index(drop=True, inplace=True)

### Top 5 venues for each neighborhood

In [97]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood name']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood name'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
            venue  freq
0      Restaurant  0.50
1            Shop  0.12
2     Pizza Place  0.05
3  Sandwich Place  0.05
4          Bakery  0.05


----Alderwood / Long Branch----
           venue  freq
0          Store  0.29
1           Shop  0.17
2    Pizza Place  0.08
3     Restaurant  0.04
4  Garden Center  0.04


----Bathurst Manor / Wilson Heights / Downsview North----
         venue  freq
0         Shop  0.20
1   Restaurant  0.17
2         Bank  0.07
3         Park  0.07
4  Pizza Place  0.07


----Bayview Village----
         venue  freq
0   Restaurant  0.29
1        Store  0.14
2  Gas Station  0.14
3         Bank  0.14
4      Dog Run  0.07


----Bedford Park / Lawrence Manor East----
            venue  freq
0      Restaurant  0.29
1            Shop  0.20
2           Store  0.11
3  Sandwich Place  0.04
4     Pizza Place  0.04


----Berczy Park----
        venue  freq
0  Restaurant  0.21
1        Shop  0.17
2        Cafe  0.06
3       Hotel  0.06
4       Store  0.

4  Sandwich Place  0.05


----Roselawn----
        venue  freq
0  Restaurant  0.30
1        Shop  0.13
2        Cafe  0.09
3    Pharmacy  0.09
4        Bank  0.09


----Rouge Hill / Port Union / Highland Creek----
            venue  freq
0      Restaurant   0.2
1    Burger Joint   0.2
2      Playground   0.2
3            Park   0.2
4  Breakfast Spot   0.2


----Runnymede / Swansea----
         venue  freq
0   Restaurant  0.21
1         Shop  0.20
2         Cafe  0.09
3       Bakery  0.05
4  Pizza Place  0.05


----Runnymede / The Junction North----
         venue  freq
0        Store  0.19
1  Pizza Place  0.11
2   Restaurant  0.11
3         Shop  0.11
4         Park  0.06


----Scarborough Village----
           venue  freq
0     Restaurant  0.25
1           Shop  0.25
2          Store  0.17
3  Bowling Alley  0.08
4  Train Station  0.08


----South Steeles / Silverstone / Humbergate / Jamestown / Mount Olive / Beaumond Heights / Thistletown / Albion Gardens----
            venue  freq


### Combining venue types:
1) Cafe and Coffee shop  
2) All restaurants merged into one group

In [98]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [99]:
columns = ['Neighborhood name']
for ind in np.arange(5):
    columns.append('{} Most Common Venue'.format(ind+1))

# create a new dataframe
sorted_venues = pd.DataFrame(columns=columns)
sorted_venues['Neighborhood name'] = toronto_grouped['Neighborhood name']

for ind in np.arange(toronto_grouped.shape[0]):
    sorted_venues.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

sorted_venues.head()

Unnamed: 0,Neighborhood name,1 Most Common Venue,2 Most Common Venue,3 Most Common Venue,4 Most Common Venue,5 Most Common Venue
0,Agincourt,Restaurant,Shop,Pizza Place,Sandwich Place,Bakery
1,Alderwood / Long Branch,Store,Shop,Pizza Place,Sandwich Place,Gym
2,Bathurst Manor / Wilson Heights / Downsview North,Shop,Restaurant,Pizza Place,Bank,Park
3,Bayview Village,Restaurant,Gas Station,Bank,Store,Cafe
4,Bedford Park / Lawrence Manor East,Restaurant,Shop,Store,Pizza Place,Bank


In [100]:
sorted_venues.shape

(97, 6)

### K means clustering

In [101]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood name', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=2).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:] 

array([0, 4, 1, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 2, 2, 1, 2, 0, 2, 4,
       2, 2, 1, 2, 2, 4, 4, 2, 2, 2, 2, 1, 2, 2, 2, 1, 4, 4, 2, 2, 4, 4,
       2, 1, 4, 1, 4, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 0, 2, 1, 1, 2, 2, 2,
       4, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 4,
       1, 4, 2, 0, 2, 0, 1, 3, 0])

In [102]:
sorted_venues

Unnamed: 0,Neighborhood name,1 Most Common Venue,2 Most Common Venue,3 Most Common Venue,4 Most Common Venue,5 Most Common Venue
0,Agincourt,Restaurant,Shop,Pizza Place,Sandwich Place,Bakery
1,Alderwood / Long Branch,Store,Shop,Pizza Place,Sandwich Place,Gym
2,Bathurst Manor / Wilson Heights / Downsview North,Shop,Restaurant,Pizza Place,Bank,Park
3,Bayview Village,Restaurant,Gas Station,Bank,Store,Cafe
4,Bedford Park / Lawrence Manor East,Restaurant,Shop,Store,Pizza Place,Bank
...,...,...,...,...,...,...
92,Willowdale / Newtonbrook,Restaurant,Shop,Cafe,Pizza Place,Bank
93,Woburn,Restaurant,Shop,Park,Pharmacy,Deli / Bodega
94,Woodbine Heights,Shop,Pizza Place,Park,Store,Spa
95,York Mills / Silver Hills,Park,Pool,Zoo,Farm,Event Space


In [103]:
# add clustering labels
sorted_venues.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(sorted_venues.set_index('Neighborhood name'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1 Most Common Venue,2 Most Common Venue,3 Most Common Venue,4 Most Common Venue,5 Most Common Venue
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353,2,Restaurant,Store,Trail,Bank,Shop
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497,2,Park,Playground,Burger Joint,Restaurant,Breakfast Spot
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711,2,Restaurant,Shop,Pizza Place,Store,Bank
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,Restaurant,Shop,Park,Pharmacy,Deli / Bodega
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,2,Restaurant,Bakery,Shop,Store,Bank


In [104]:
import matplotlib.cm as cm
import matplotlib.colors as colors
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)
# create map
map_clusters = folium.Map(location=[lat_mean, lon_mean], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Check the clusters

### Cluster 0 (Red)
Pizza / Groceries spreaded far away from city centre

In [105]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1 Most Common Venue,2 Most Common Venue,3 Most Common Venue,4 Most Common Venue,5 Most Common Venue
3,Scarborough,0,Restaurant,Shop,Park,Pharmacy,Deli / Bodega
12,Scarborough,0,Restaurant,Shop,Pizza Place,Sandwich Place,Bakery
14,Scarborough,0,Restaurant,Shop,Pizza Place,Park,Pharmacy
21,North York,0,Restaurant,Shop,Pizza Place,Cafe,Bank
22,North York,0,Restaurant,Shop,Pizza Place,Cafe,Bank
41,Central Toronto,0,Restaurant,Shop,Store,Cafe,Pizza Place
80,Mississauga,0,Restaurant,Shop,Hotel,Gym,Pizza Place
85,Etobicoke,0,Restaurant,Park,Shop,Bus Stop,Design Studio


### Cluster 1
Park, Zoo 

In [106]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1 Most Common Venue,2 Most Common Venue,3 Most Common Venue,4 Most Common Venue,5 Most Common Venue
7,Scarborough,1,Restaurant,Store,Intersection,Shop,Pharmacy
8,Scarborough,1,Pizza Place,Shop,Beach,Restaurant,Sports Bar
16,North York,1,Shop,Restaurant,Store,Pharmacy,Park
23,North York,1,Shop,Store,Park,Restaurant,Bus Stop
25,North York,1,Shop,Restaurant,Pizza Place,Bank,Park
27,North York,1,Shop,Pizza Place,Park,Store,Spa
28,North York,1,Shop,Store,Lounge,Gym / Fitness Center,Cafe
29,East York,1,Shop,Pizza Place,Brewery,Bakery,Soccer Stadium
30,East York,1,Shop,Pizza Place,Park,Store,Spa
38,Central Toronto,1,College Gym,Trail,Bookstore,Shop,Park


In [107]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1 Most Common Venue,2 Most Common Venue,3 Most Common Venue,4 Most Common Venue,5 Most Common Venue
0,Scarborough,2,Restaurant,Store,Trail,Bank,Shop
1,Scarborough,2,Park,Playground,Burger Joint,Restaurant,Breakfast Spot
2,Scarborough,2,Restaurant,Shop,Pizza Place,Store,Bank
4,Scarborough,2,Restaurant,Bakery,Shop,Store,Bank
5,Scarborough,2,Shop,Restaurant,Store,Pizza Place,Train Station
9,Scarborough,2,Restaurant,Gym Pool,Store,Cafe,Park
10,Scarborough,2,Restaurant,Store,Shop,Pharmacy,Light Rail Station
11,Scarborough,2,Restaurant,Store,Pizza Place,Bakery,Burger Joint
13,Scarborough,2,Restaurant,Shop,Store,Pharmacy,Sandwich Place
15,Scarborough,2,Restaurant,Shop,Store,Pizza Place,Bakery


In [108]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1 Most Common Venue,2 Most Common Venue,3 Most Common Venue,4 Most Common Venue,5 Most Common Venue
19,North York,3,Park,Pool,Zoo,Farm,Event Space


In [109]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1 Most Common Venue,2 Most Common Venue,3 Most Common Venue,4 Most Common Venue,5 Most Common Venue
6,Scarborough,4,Store,Restaurant,Shop,Pizza Place,Burger Joint
17,North York,4,Store,Shop,Restaurant,Sandwich Place,Juice Bar
32,East York,4,Store,Shop,Restaurant,Burger Joint,Bank
44,Downtown Toronto,4,Store,Shop,Park,Restaurant,Metro Station
65,North York,4,Store,Restaurant,Shop,Fried Chicken Joint,Bowling Alley
74,York,4,Store,Shop,Restaurant,Playground,Gas Station
75,York,4,Store,Pizza Place,Restaurant,Shop,Park
83,Etobicoke,4,Store,Shop,Pizza Place,Sandwich Place,Gym
87,Etobicoke,4,Pharmacy,Store,Bank,Shop,Cafe
88,Etobicoke,4,Park,Store,Pizza Place,Restaurant,Bank


In [110]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 5, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1 Most Common Venue,2 Most Common Venue,3 Most Common Venue,4 Most Common Venue,5 Most Common Venue
