# Part 1

## Lybraries

In [1]:
!pip install bs4



In [2]:
import pandas as pd

from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page

## Webscraping

In [3]:
# OK - The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
# OK - Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
# OK - More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11  in the above table.
# OK - If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.
# OK - Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.
# OK - In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [5]:
# get the contents of the webpage in text format and store in a variable called data
data  = requests.get(url).text

In [6]:
soup = BeautifulSoup(data,"html5lib")

In [7]:
print(soup.title)
from IPython.display import display_html
display_html(str(soup.table),raw=True)

<title>List of postal codes of Canada: M - Wikipedia</title>


0,1,2,3,4,5,6,7,8
M1A Not assigned,M2A Not assigned,M3A North York (Parkwoods),M4A North York (Victoria Village),M5A Downtown Toronto (Regent Park / Harbourfront),M6A North York (Lawrence Manor / Lawrence Heights),M7A Queen's Park (Ontario Provincial Government),M8A Not assigned,M9A Etobicoke (Islington Avenue)
M1B Scarborough (Malvern / Rouge),M2B Not assigned,M3B North York (Don Mills) North,M4B East York (Parkview Hill / Woodbine Gardens),"M5B Downtown Toronto (Garden District, Ryerson)",M6B North York (Glencairn),M7B Not assigned,M8B Not assigned,M9B Etobicoke (West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale)
M1C Scarborough (Rouge Hill / Port Union / Highland Creek),M2C Not assigned,M3C North York (Don Mills) South (Flemingdon Park),M4C East York (Woodbine Heights),M5C Downtown Toronto (St. James Town),M6C York (Humewood-Cedarvale),M7C Not assigned,M8C Not assigned,M9C Etobicoke (Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood)
M1E Scarborough (Guildwood / Morningside / West Hill),M2E Not assigned,M3E Not assigned,M4E East Toronto (The Beaches),M5E Downtown Toronto (Berczy Park),M6E York (Caledonia-Fairbanks),M7E Not assigned,M8E Not assigned,M9E Not assigned
M1G Scarborough (Woburn),M2G Not assigned,M3G Not assigned,M4G East York (Leaside),M5G Downtown Toronto (Central Bay Street),M6G Downtown Toronto (Christie),M7G Not assigned,M8G Not assigned,M9G Not assigned
M1H Scarborough (Cedarbrae),M2H North York (Hillcrest Village),M3H North York (Bathurst Manor / Wilson Heights / Downsview North),M4H East York (Thorncliffe Park),M5H Downtown Toronto (Richmond / Adelaide / King),M6H West Toronto (Dufferin / Dovercourt Village),M7H Not assigned,M8H Not assigned,M9H Not assigned
M1J Scarborough (Scarborough Village),M2J North York (Fairview / Henry Farm / Oriole),M3J North York (Northwood Park / York University),M4J East York East Toronto (The Danforth East),M5J Downtown Toronto (Harbourfront East / Union Station / Toronto Islands),M6J West Toronto (Little Portugal / Trinity),M7J Not assigned,M8J Not assigned,M9J Not assigned
M1K Scarborough (Kennedy Park / Ionview / East Birchmount Park),M2K North York (Bayview Village),M3K North York (Downsview) East (CFB Toronto),M4K East Toronto (The Danforth West / Riverdale),M5K Downtown Toronto (Toronto Dominion Centre / Design Exchange),M6K West Toronto (Brockton / Parkdale Village / Exhibition Place),M7K Not assigned,M8K Not assigned,M9K Not assigned
M1L Scarborough (Golden Mile / Clairlea / Oakridge),M2L North York (York Mills / Silver Hills),M3L North York (Downsview) West,M4L East Toronto (India Bazaar / The Beaches West),M5L Downtown Toronto (Commerce Court / Victoria Hotel),M6L North York (North Park / Maple Leaf Park / Upwood Park),M7L Not assigned,M8L Not assigned,M9L North York (Humber Summit)
M1M Scarborough (Cliffside / Cliffcrest / Scarborough Village West),M2M North York (Willowdale / Newtonbrook),M3M North York (Downsview) Central,M4M East Toronto (Studio District),M5M North York (Bedford Park / Lawrence Manor East),M6M York (Del Ray / Mount Dennis / Keelsdale and Silverthorn),M7M Not assigned,M8M Not assigned,M9M North York (Humberlea / Emery)


Find the html table in the web page

In [8]:
#find a html table in the web page
table = soup.table # in html table is represented by the tag <table>
print(table.tr.td.prettify()) # print to understand the structure

<td style="width:11%;">
 <p>
  M1A
  <br/>
  <span style="font-size:85%;">
   Not assigned
  </span>
 </p>
</td>



Populate the dataframe 'neighborhoods'

In [9]:
# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 
neighborhoods = pd.DataFrame(columns=column_names)

for i,row in enumerate(table.find_all("tr")):
  #print("row",i)
  cells=row.find_all('td')
  for j,cell in enumerate(cells):
    postal_code = cell.p.text[:3]
    temp = cell.p.span.text
    temp = temp.split('(')
    if(len(temp) > 1):
      borough = temp[0]
      neighborhood = temp[1].replace('/', ', ').replace(')','')
    else: 
      borough = temp[0]
      neighborhood = temp[0]
    
    #print('colunm',j,"cell",postal_code, '\n', borough, '\n', neighborhood, '\n', '\n\n')
    neighborhoods = neighborhoods.append({"PostalCode":postal_code, "Borough":borough, "Neighborhood":neighborhood}, ignore_index=True)

print(neighborhoods.shape)
neighborhoods.head()

(180, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"


Check the data and the 'Not assigned' values

In [10]:
neighborhoods['Borough'].value_counts()

Not assigned                                                    77
North York                                                      24
Downtown Toronto                                                17
Scarborough                                                     17
Etobicoke                                                       11
Central Toronto                                                  9
West Toronto                                                     6
York                                                             5
East York                                                        4
East Toronto                                                     4
EtobicokeNorthwest                                               1
MississaugaCanada Post Gateway Processing Centre                 1
Queen's Park                                                     1
East YorkEast Toronto                                            1
East TorontoBusiness reply mail Processing Centre969 Eastern  

Remove 'Not assigned' values

In [11]:
neighborhoods = neighborhoods.drop(neighborhoods[neighborhoods['Borough'] == 'Not assigned'].index)
print(neighborhoods.shape)
neighborhoods['Borough'].value_counts()

(103, 3)


North York                                                      24
Scarborough                                                     17
Downtown Toronto                                                17
Etobicoke                                                       11
Central Toronto                                                  9
West Toronto                                                     6
York                                                             5
East York                                                        4
East Toronto                                                     4
EtobicokeNorthwest                                               1
MississaugaCanada Post Gateway Processing Centre                 1
Queen's Park                                                     1
East YorkEast Toronto                                            1
East TorontoBusiness reply mail Processing Centre969 Eastern     1
Downtown TorontoStn A PO Boxes25 The Esplanade                

Fix the "typos"

In [12]:
neighborhoods = neighborhoods.replace('MississaugaCanada Post Gateway Processing Centre', 'Mississauga')
neighborhoods = neighborhoods.replace('Downtown TorontoStn A PO Boxes25 The Esplanade', 'Downtown Toronto')
neighborhoods = neighborhoods.replace('East YorkEast Toronto', 'East York, East Toronto')
neighborhoods = neighborhoods.replace('EtobicokeNorthwest', 'Etobicoke, Northwest')
neighborhoods = neighborhoods.replace('East TorontoBusiness reply mail Processing Centre969 Eastern', 'East Toronto')

neighborhoods = neighborhoods.replace('Ontario Provincial Government', "Queen's Park")
neighborhoods = neighborhoods.replace('Don MillsNorth', 'Don Mills')
neighborhoods = neighborhoods.replace('Don MillsSouth', 'Don Mills')
neighborhoods = neighborhoods.replace('Caledonia-Fairbanks', 'York')
neighborhoods = neighborhoods.replace('DownsviewEast  ', 'Downsview')
neighborhoods = neighborhoods.replace('DownsviewWest', 'Downsview')
neighborhoods = neighborhoods.replace('DownsviewCentral', 'Downsview')
neighborhoods = neighborhoods.replace('WillowdaleSouth', 'Willowdale')
neighborhoods = neighborhoods.replace('DownsviewNorthwest', 'Downsview')
neighborhoods = neighborhoods.replace('WillowdaleWest', 'Willowdale')
neighborhoods = neighborhoods.replace('Enclave of L4W', 'Mississauga')
neighborhoods = neighborhoods.replace('Enclave of M4L', 'East Toronto')

neighborhoods['Borough'].value_counts()

North York                 24
Downtown Toronto           18
Scarborough                17
Etobicoke                  11
Central Toronto             9
West Toronto                6
East Toronto                5
York                        5
East York                   4
East York, East Toronto     1
Mississauga                 1
Queen's Park                1
Etobicoke, Northwest        1
Name: Borough, dtype: int64

In [13]:
neighborhoods.shape

(103, 3)

# Part 2

In [14]:
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Queen's Park,Queen's Park


I need to get the latitude and the longitude coordinates of each **neighborhood**

In [15]:
!pip install geocoder

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[?25l[K     |███▎                            | 10 kB 23.3 MB/s eta 0:00:01[K     |██████▋                         | 20 kB 30.5 MB/s eta 0:00:01[K     |██████████                      | 30 kB 36.2 MB/s eta 0:00:01[K     |█████████████▎                  | 40 kB 40.9 MB/s eta 0:00:01[K     |████████████████▋               | 51 kB 34.7 MB/s eta 0:00:01[K     |████████████████████            | 61 kB 37.3 MB/s eta 0:00:01[K     |███████████████████████▎        | 71 kB 28.5 MB/s eta 0:00:01[K     |██████████████████████████▋     | 81 kB 29.8 MB/s eta 0:00:01[K     |██████████████████████████████  | 92 kB 31.7 MB/s eta 0:00:01[K     |████████████████████████████████| 98 kB 7.6 MB/s 
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [16]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

def calc_latlng(row):
  neighbor = row['Neighborhood'].split(',')[0]
  address = '{}, Toronto, Ontario'.format(neighbor)

  geolocator = Nominatim(user_agent="ny_explorer")
  location = geolocator.geocode(address)
  if (location):
    print('The geograpical coordinate of {} are {}, {}.'.format(neighbor, location.latitude, location.longitude))
    return [location.latitude, location.longitude]
  else:
    print('error -> ', neighbor, '\n\n')
    return [0.0, 0.0]

In [17]:
neighborhoods['latlng'] = neighborhoods.apply(lambda x: calc_latlng(x), axis=1)

neighborhoods.head()

The geograpical coordinate of Parkwoods are 43.7587999, -79.3201966.
The geograpical coordinate of Victoria Village are 43.732658, -79.3111892.
The geograpical coordinate of Regent Park  are 43.6607056, -79.3604569.
The geograpical coordinate of Lawrence Manor  are 43.7220788, -79.4375067.
The geograpical coordinate of Queen's Park are 43.659659, -79.3903399.
The geograpical coordinate of Islington Avenue are 43.688307, -79.542802.
The geograpical coordinate of Malvern  are 43.8091955, -79.2217008.
The geograpical coordinate of Don Mills are 43.775347, -79.3459439.
The geograpical coordinate of Parkview Hill  are 43.7062977, -79.3219073.
The geograpical coordinate of Garden District are 43.6564995, -79.3771141.
The geograpical coordinate of Glencairn are 43.7087117, -79.4406853.
The geograpical coordinate of West Deane Park  are 43.6631995, -79.5685684.
The geograpical coordinate of Rouge Hill  are 43.7802711, -79.1304992.
The geograpical coordinate of Don Mills are 43.775347, -79.3459

Unnamed: 0,PostalCode,Borough,Neighborhood,latlng
2,M3A,North York,Parkwoods,"[43.7587999, -79.3201966]"
3,M4A,North York,Victoria Village,"[43.732658, -79.3111892]"
4,M5A,Downtown Toronto,"Regent Park , Harbourfront","[43.6607056, -79.3604569]"
5,M6A,North York,"Lawrence Manor , Lawrence Heights","[43.7220788, -79.4375067]"
6,M7A,Queen's Park,Queen's Park,"[43.659659, -79.3903399]"


In [18]:
neighborhoods['Latitude'] = neighborhoods['latlng'].apply(lambda x: x[0])
neighborhoods['Longitude'] = neighborhoods['latlng'].apply(lambda x: x[1])
neighborhoods.drop(columns=['latlng'], inplace=True)

In [19]:
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.758800,-79.320197
3,M4A,North York,Victoria Village,43.732658,-79.311189
4,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.660706,-79.360457
5,M6A,North York,"Lawrence Manor , Lawrence Heights",43.722079,-79.437507
6,M7A,Queen's Park,Queen's Park,43.659659,-79.390340
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.647381,-79.511333
165,M4Y,Downtown Toronto,Church and Wellesley,43.665524,-79.383801
168,M7Y,East Toronto,East Toronto,43.626243,-79.396962
169,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea...",43.649818,-79.494278


# Part 3

## Create a map of Toronto by neighborhoods


In [20]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [21]:
import folium # map rendering library

# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

## I've decide to do the analysis on the borough that has more neighborhoods

In [23]:
neighborhoods['Borough'].value_counts()

North York                 24
Downtown Toronto           18
Scarborough                17
Etobicoke                  11
Central Toronto             9
West Toronto                6
East Toronto                5
York                        5
East York                   4
East York, East Toronto     1
Mississauga                 1
Queen's Park                1
Etobicoke, Northwest        1
Name: Borough, dtype: int64

Plotting the map just for **'North York'**

In [35]:
ny_df = neighborhoods[neighborhoods['Borough'] == 'North York']

In [36]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(ny_df['Latitude'], ny_df['Longitude'], ny_df['Borough'], ny_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.3,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

## Explore Neighborhoods in North York

### Define Foursquare Credentials and Version


In [26]:
CLIENT_ID = '3UNUMQW45CQVFYKO531O03ZXL2WVZH3RZXJFD53QOMT2IMYG' # your Foursquare ID
CLIENT_SECRET = 'EHYX0QAJJEGEMF0E4IF1QPZTEMWJGVBXFK51PNFWFQ2GXOWJ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3UNUMQW45CQVFYKO531O03ZXL2WVZH3RZXJFD53QOMT2IMYG
CLIENT_SECRET:EHYX0QAJJEGEMF0E4IF1QPZTEMWJGVBXFK51PNFWFQ2GXOWJ


### Analyze some big numbers

In [27]:
def getNearbyVenues(neigh_list, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(neigh_list, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Now write the code to run the above function on each neighborhood and create a new dataframe called _ny_venues_.


In [38]:
ny_venues = getNearbyVenues(names=ny_df['Neighborhood'],
                                   latitudes=ny_df['Latitude'],
                                   longitudes=ny_df['Longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Manor ,  Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor ,  Wilson Heights ,  Downsview North
Fairview ,  Henry Farm ,  Oriole
Northwood Park ,  York University
Bayview Village
Downsview
York Mills ,  Silver Hills
Downsview
North Park ,  Maple Leaf Park ,  Upwood Park
Humber Summit
Willowdale ,  Newtonbrook
Downsview
Bedford Park ,  Lawrence Manor East
Humberlea ,  Emery
Willowdale
Downsview
York Mills West
Willowdale


In [39]:
print(ny_venues.shape)
ny_venues.head()

(516, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7588,-79.320197,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.7588,-79.320197,LCBO,43.757774,-79.314257,Liquor Store
2,Parkwoods,43.7588,-79.320197,Shoppers Drug Mart,43.760857,-79.324961,Pharmacy
3,Parkwoods,43.7588,-79.320197,Petro-Canada,43.75795,-79.315187,Gas Station
4,Parkwoods,43.7588,-79.320197,Pizza Pizza,43.760231,-79.325666,Pizza Place


In [43]:
print('There are {} uniques categories in North York.'.format(len(ny_venues['Venue Category'].unique())))

There are 104 uniques categories in North York.


In [41]:
ny_venues[['Neighborhood', 'Venue', 'Venue Category']].groupby(by=['Neighborhood']).nunique().reset_index().sort_values(by=['Venue', 'Venue Category'], ascending=False)

Unnamed: 0,Neighborhood,Venue,Venue Category
5,"Fairview , Henry Farm , Oriole",64,44
3,Don Mills,59,39
15,Willowdale,44,31
16,"Willowdale , Newtonbrook",44,31
7,Hillcrest Village,43,34
17,"York Mills , Silver Hills",17,14
18,York Mills West,17,14
13,Parkwoods,12,12
1,Bayview Village,11,10
4,Downsview,10,9


'Fairview , Henry Farm , Oriole' neighbohood has more **venues** on North York,

'Fairview , Henry Farm , Oriole' neighbohood has more **diversity of venues** on North York.


### Analyze Each Neighborhood

In [47]:
# one hot encoding
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ny_onehot['Neighborhood'] = ny_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ny_onehot.columns[-1]] + list(ny_onehot.columns[:-1])
ny_onehot = ny_onehot[fixed_columns]

print(ny_onehot.shape)
ny_onehot.head()

(516, 105)


Unnamed: 0,Neighborhood,ATM,Accessories Store,American Restaurant,Art Gallery,Asian Restaurant,Auto Garage,BBQ Joint,Bakery,Bank,Bar,Baseball Field,Boutique,Breakfast Spot,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Station,Business Service,Café,Caribbean Restaurant,Chinese Restaurant,Chocolate Shop,Clothing Store,Coffee Shop,Construction & Landscaping,Convenience Store,Cosmetics Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Doctor's Office,Electronics Store,Event Space,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Fish & Chips Shop,...,Laundry Service,Liquor Store,Luggage Store,Mediterranean Restaurant,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Movie Theater,Moving Target,Optical Shop,Outdoor Supply Store,Park,Persian Restaurant,Pharmacy,Pizza Place,Playground,Poke Place,Portuguese Restaurant,Poutine Place,Pub,Ramen Restaurant,Restaurant,Salon / Barbershop,Sandwich Place,Shoe Store,Shop & Service,Shopping Mall,South American Restaurant,Spa,Sporting Goods Shop,Sushi Restaurant,Thai Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Video Game Store,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category


In [49]:
ny_grouped = ny_onehot.groupby('Neighborhood').mean().reset_index()

print(ny_grouped.shape)
ny_grouped.head()

(19, 105)


Unnamed: 0,Neighborhood,ATM,Accessories Store,American Restaurant,Art Gallery,Asian Restaurant,Auto Garage,BBQ Joint,Bakery,Bank,Bar,Baseball Field,Boutique,Breakfast Spot,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Station,Business Service,Café,Caribbean Restaurant,Chinese Restaurant,Chocolate Shop,Clothing Store,Coffee Shop,Construction & Landscaping,Convenience Store,Cosmetics Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Doctor's Office,Electronics Store,Event Space,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Fish & Chips Shop,...,Laundry Service,Liquor Store,Luggage Store,Mediterranean Restaurant,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Movie Theater,Moving Target,Optical Shop,Outdoor Supply Store,Park,Persian Restaurant,Pharmacy,Pizza Place,Playground,Poke Place,Portuguese Restaurant,Poutine Place,Pub,Ramen Restaurant,Restaurant,Salon / Barbershop,Sandwich Place,Shoe Store,Shop & Service,Shopping Mall,South American Restaurant,Spa,Sporting Goods Shop,Sushi Restaurant,Thai Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Video Game Store,Women's Store,Yoga Studio
0,"Bathurst Manor , Wilson Heights , Downsview ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.090909,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park , Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Don Mills,0.0,0.0,0.016129,0.0,0.016129,0.0,0.0,0.032258,0.032258,0.016129,0.0,0.016129,0.0,0.0,0.016129,0.016129,0.016129,0.0,0.0,0.0,0.016129,0.016129,0.145161,0.080645,0.0,0.016129,0.016129,0.0,0.0,0.016129,0.0,0.0,0.0,0.016129,0.0,0.0,0.048387,0.0,0.0,...,0.0,0.016129,0.016129,0.0,0.0,0.0,0.0,0.016129,0.0,0.016129,0.0,0.0,0.0,0.0,0.0,0.016129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048387,0.016129,0.032258,0.016129,0.0,0.016129,0.0,0.016129,0.016129,0.0,0.0,0.016129,0.0,0.016129,0.016129,0.016129,0.0
4,Downsview,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Let's print each neighborhood along with the top 5 most common venues


In [50]:
num_top_venues = 5

for hood in ny_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = ny_grouped[ny_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor ,  Wilson Heights ,  Downsview North----
                      venue  freq
0                      Park  0.25
1         Convenience Store  0.25
2                Playground  0.25
3            Baseball Field  0.25
4  Mediterranean Restaurant  0.00


----Bayview Village----
                  venue  freq
0                  Bank  0.18
1           Gas Station  0.09
2           Fish Market  0.09
3  Outdoor Supply Store  0.09
4        Breakfast Spot  0.09


----Bedford Park ,  Lawrence Manor East----
                        venue  freq
0  Construction & Landscaping   1.0
1                         ATM   0.0
2          Persian Restaurant   0.0
3        Outdoor Supply Store   0.0
4                Optical Shop   0.0


----Don Mills----
                  venue  freq
0        Clothing Store  0.15
1           Coffee Shop  0.08
2            Restaurant  0.05
3  Fast Food Restaurant  0.05
4   Japanese Restaurant  0.05


----Downsview----
                    venue  freq
0             Bu

#### Let's put that into a _pandas_ dataframe


In [51]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [61]:
import numpy as np

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ny_grouped['Neighborhood']

for ind in np.arange(ny_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor , Wilson Heights , Downsview ...",Park,Playground,Baseball Field,Convenience Store,Fast Food Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store
1,Bayview Village,Bank,Outdoor Supply Store,Sporting Goods Shop,Persian Restaurant,Pizza Place,Fast Food Restaurant,Sandwich Place,Fish Market,Breakfast Spot,Gas Station
2,"Bedford Park , Lawrence Manor East",Construction & Landscaping,Cosmetics Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Doctor's Office,Electronics Store,Event Space
3,Don Mills,Clothing Store,Coffee Shop,Japanese Restaurant,Fast Food Restaurant,Restaurant,Sandwich Place,Juice Bar,Bank,Bakery,Food Court
4,Downsview,Bus Station,Gym / Fitness Center,Gym Pool,Gas Station,Furniture / Home Store,Coffee Shop,French Restaurant,Playground,Metro Station,Event Space


### Cluster Neighborhoods


Run _k_-means to cluster the neighborhood into 5 clusters.


In [62]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

ny_grouped_clustering = ny_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ny_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 3, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.


In [63]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ny_merged = ny_df

# merge ny_grouped with ny_df to add latitude/longitude for each neighborhood
ny_merged = ny_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

print(ny_merged.shape)
ny_merged.head() # check the last columns!

(24, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M3A,North York,Parkwoods,43.7588,-79.320197,1,ATM,Shopping Mall,Gas Station,Laundry Service,Liquor Store,Discount Store,Pharmacy,Pizza Place,Chinese Restaurant,Caribbean Restaurant
3,M4A,North York,Victoria Village,43.732658,-79.311189,4,Park,Thai Restaurant,Middle Eastern Restaurant,Fast Food Restaurant,Cosmetics Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store
5,M6A,North York,"Lawrence Manor , Lawrence Heights",43.722079,-79.437507,1,Park,Bank,Doctor's Office,Electronics Store,Kids Store,Furniture / Home Store,Fruit & Vegetable Store,Dance Studio,Deli / Bodega,Department Store
11,M3B,North York,Don Mills,43.775347,-79.345944,1,Clothing Store,Coffee Shop,Japanese Restaurant,Fast Food Restaurant,Restaurant,Sandwich Place,Juice Bar,Bank,Bakery,Food Court
14,M6B,North York,Glencairn,43.708712,-79.440685,1,Grocery Store,Asian Restaurant,Playground,Bakery,Japanese Restaurant,Metro Station,Yoga Studio,Fast Food Restaurant,Department Store,Dessert Shop


Finally, let's visualize the resulting clusters


In [66]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ny_merged['Latitude'], ny_merged['Longitude'], ny_merged['Neighborhood'], ny_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters


Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.


#### Cluster 1


In [67]:
ny_merged.loc[ny_merged['Cluster Labels'] == 0, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
77,North York,0,Convenience Store,American Restaurant,Coffee Shop,Chinese Restaurant,Fish & Chips Shop,Department Store,Dessert Shop,Discount Store,Doctor's Office,Electronics Store


#### Cluster 2


In [68]:
ny_merged.loc[ny_merged['Cluster Labels'] == 1, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,North York,1,ATM,Shopping Mall,Gas Station,Laundry Service,Liquor Store,Discount Store,Pharmacy,Pizza Place,Chinese Restaurant,Caribbean Restaurant
5,North York,1,Park,Bank,Doctor's Office,Electronics Store,Kids Store,Furniture / Home Store,Fruit & Vegetable Store,Dance Studio,Deli / Bodega,Department Store
11,North York,1,Clothing Store,Coffee Shop,Japanese Restaurant,Fast Food Restaurant,Restaurant,Sandwich Place,Juice Bar,Bank,Bakery,Food Court
14,North York,1,Grocery Store,Asian Restaurant,Playground,Bakery,Japanese Restaurant,Metro Station,Yoga Studio,Fast Food Restaurant,Department Store,Dessert Shop
20,North York,1,Clothing Store,Coffee Shop,Japanese Restaurant,Fast Food Restaurant,Restaurant,Sandwich Place,Juice Bar,Bank,Bakery,Food Court
46,North York,1,Italian Restaurant,Café,Restaurant,Coffee Shop,Mexican Restaurant,Sushi Restaurant,Bakery,Indian Restaurant,Playground,Burger Joint
47,North York,1,Park,Playground,Baseball Field,Convenience Store,Fast Food Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store
55,North York,1,Clothing Store,Coffee Shop,Shoe Store,Restaurant,Fast Food Restaurant,Food Court,Japanese Restaurant,Cosmetics Shop,Juice Bar,Bank
64,North York,1,Bank,Outdoor Supply Store,Sporting Goods Shop,Persian Restaurant,Pizza Place,Fast Food Restaurant,Sandwich Place,Fish Market,Breakfast Spot,Gas Station
65,North York,1,Bus Station,Gym / Fitness Center,Gym Pool,Gas Station,Furniture / Home Store,Coffee Shop,French Restaurant,Playground,Metro Station,Event Space


#### Cluster 3


In [69]:
ny_merged.loc[ny_merged['Cluster Labels'] == 2, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
56,North York,2,Park,Baseball Field,Filipino Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Doctor's Office,Electronics Store


#### Cluster 4

In [70]:
ny_merged.loc[ny_merged['Cluster Labels'] == 3, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
85,North York,3,Construction & Landscaping,Cosmetics Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store,Doctor's Office,Electronics Store,Event Space


#### Cluster 5

In [71]:
ny_merged.loc[ny_merged['Cluster Labels'] == 4, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,North York,4,Park,Thai Restaurant,Middle Eastern Restaurant,Fast Food Restaurant,Cosmetics Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store
