In [1]:
!pip install bs4



In [2]:
import numpy as np
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from bs4 import BeautifulSoup

print('Libraries imported.')

Libraries imported.


  **Download and Explore Dataset**

Use the Beautiful Soup for extract HTML.

In [3]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url)
#print(soup.prettify())

In [4]:
My_table = soup.find('table',{'class':'wikitable sortable'})

**Tranform the data into a pandas dataframe**

Get Table to Dataframe and Unassigned checks are made in Borough and Neighborhood

In [5]:
Head = My_table.find_all('tr')
Table = []
for th in Head:
    row = np.array(th.getText()[1:-1].split('\n'))
    if row[1] != 'Not assigned': 
        if row[2] == 'Not assigned':
            row[2] = row[1]
        Table.append(row)
        
df_Canada = pd.DataFrame(data=Table[1:], columns=Table[0])
df_Canada

Unnamed: 0,Postal Code,Unnamed: 2,Borough,Unnamed: 4,Neighborhood
0,M1A,,,,Not assigned
1,M2A,,,,Not assigned
2,M3A,,North York,,Parkwoods
3,M4A,,North York,,Victoria Village
4,M5A,,Downtown Toronto,,"Regent Park, Harbourfront"
5,M6A,,North York,,"Lawrence Manor, Lawrence Heights"
6,M7A,,Downtown Toronto,,"Queen's Park, Ontario Provincial Government"
7,M8A,,,,Not assigned
8,M9A,,Etobicoke,,"Islington Avenue, Humber Valley Village"
9,M1B,,Scarborough,,"Malvern, Rouge"



Get Name Columns

In [6]:
Col = df_Canada.columns
df_Canada_Group = df_Canada.groupby(Col[0])[Col[4]].apply(lambda x: ''.join([str(elem+',') for elem in list(x)])).reset_index()
df_Canada_Group.head()

Unnamed: 0,Postal Code,Neighborhood
0,M1A,"Not assigned,"
1,M1B,"Malvern, Rouge,"
2,M1C,"Rouge Hill, Port Union, Highland Creek,"
3,M1E,"Guildwood, Morningside, West Hill,"
4,M1G,"Woburn,"


The next task is essentially transforming this data of nested Python dictionaries into a pandas dataframe. So let's start by creating an empty dataframe.

In [7]:
column_names = ['Postal Code', 'Borough', 'Neighbourhood']

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [8]:
df_repeated = df_Canada.sort_values('Postal Code', ascending=True)
df_repeated.drop_duplicates('Postal Code', keep="last", inplace=True)
df_Canada_Group['Borough'] = df_repeated.reset_index()['Borough']
df_Canada_Group.head()

Unnamed: 0,Postal Code,Neighborhood,Borough
0,M1A,"Not assigned,",
1,M1B,"Malvern, Rouge,",Scarborough
2,M1C,"Rouge Hill, Port Union, Highland Creek,",Scarborough
3,M1E,"Guildwood, Morningside, West Hill,",Scarborough
4,M1G,"Woburn,",Scarborough


**Geospatial data**

In [9]:
!pip install geocoder



Neighborhood has a total of 5 boroughs and 306 neighborhoods. In order to segement the neighborhoods and explore them, we will essentially need a dataset that contains the 5 boroughs and the neighborhoods that exist in each borough as well as the the latitude and logitude coordinates of each neighborhood.

In [10]:
!wget -q -O 'postal_data.csv' http://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [11]:
df_csv = pd.read_csv('postal_data.csv')
df_csv.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
df_Canada_Group['Latitude'] = df_csv['Latitude']
df_Canada_Group['Longitude'] = df_csv['Longitude']
df_Canada_Group

Unnamed: 0,Postal Code,Neighborhood,Borough,Latitude,Longitude
0,M1A,"Not assigned,",,43.806686,-79.194353
1,M1B,"Malvern, Rouge,",Scarborough,43.784535,-79.160497
2,M1C,"Rouge Hill, Port Union, Highland Creek,",Scarborough,43.763573,-79.188711
3,M1E,"Guildwood, Morningside, West Hill,",Scarborough,43.770992,-79.216917
4,M1G,"Woburn,",Scarborough,43.773136,-79.239476
5,M1H,"Cedarbrae,",Scarborough,43.744734,-79.239476
6,M1J,"Scarborough Village,",Scarborough,43.727929,-79.262029
7,M1K,"Kennedy Park, Ionview, East Birchmount Park,",Scarborough,43.711112,-79.284577
8,M1L,"Golden Mile, Clairlea, Oakridge,",Scarborough,43.716316,-79.239476
9,M1M,"Cliffside, Cliffcrest, Scarborough Village West,",Scarborough,43.692657,-79.264848


In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent tr_explorer, as shown below.

In [13]:
address = 'Toronto'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
print(location)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

Toronto, Golden Horseshoe, Ontario, M5H 2N2, Canada
The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [14]:
toronto_data = df_Canada_Group[df_Canada_Group['Neighborhood'] != 'Not assigned,'].reset_index(drop=True)
toronto_data1 = toronto_data.dropna()
toronto_data1

Unnamed: 0,Postal Code,Neighborhood,Borough,Latitude,Longitude
0,M1B,"Malvern, Rouge,",Scarborough,43.784535,-79.160497
1,M1C,"Rouge Hill, Port Union, Highland Creek,",Scarborough,43.763573,-79.188711
2,M1E,"Guildwood, Morningside, West Hill,",Scarborough,43.770992,-79.216917
3,M1G,"Woburn,",Scarborough,43.773136,-79.239476
4,M1H,"Cedarbrae,",Scarborough,43.744734,-79.239476
5,M1J,"Scarborough Village,",Scarborough,43.727929,-79.262029
6,M1K,"Kennedy Park, Ionview, East Birchmount Park,",Scarborough,43.711112,-79.284577
7,M1L,"Golden Mile, Clairlea, Oakridge,",Scarborough,43.716316,-79.239476
8,M1M,"Cliffside, Cliffcrest, Scarborough Village West,",Scarborough,43.692657,-79.264848
9,M1N,"Birch Cliff, Cliffside West,",Scarborough,43.75741,-79.273304


Folium is a great visualization library. Feel free to zoom into the above map, and click on each circle mark to reveal the name of the neighborhood and its respective borough.

In [15]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data1['Latitude'], toronto_data1['Longitude'], toronto_data1['Borough'], toronto_data1['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='#5A99A2',
        fill=True,
        fill_color='#3AC2CE',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [16]:

CLIENT_ID = '3CU31MEK1COG5ND15MTGT2O0SWSUV5TTEBDRO4ME5IVVNDHB' # your Foursquare ID
CLIENT_SECRET = 'VECWVZNJA0ONZHX3ZD2TGDGJ1PLDWGX34A1AOFEIEUXPD11P' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3CU31MEK1COG5ND15MTGT2O0SWSUV5TTEBDRO4ME5IVVNDHB
CLIENT_SECRET:VECWVZNJA0ONZHX3ZD2TGDGJ1PLDWGX34A1AOFEIEUXPD11P


However, for illustration purposes, let's simplify the above map and segment and cluster only the neighborhoods in Central Toront. So let's slice the original dataframe and create a new dataframe of the Central Toronto data.

In [17]:
centraltoronto_data = toronto_data1[toronto_data1['Borough'] == 'Central Toronto'].reset_index(drop=True)
centraltoronto_data.head()

Unnamed: 0,Postal Code,Neighborhood,Borough,Latitude,Longitude
0,M4N,"Lawrence Park,",Central Toronto,43.648429,-79.38228
1,M4P,"Davisville North,",Central Toronto,43.718518,-79.464763
2,M4R,"North Toronto West, Lawrence Park,",Central Toronto,43.709577,-79.445073
3,M4S,"Davisville,",Central Toronto,43.693781,-79.428191
4,M4T,"Moore Park, Summerhill East,",Central Toronto,43.689026,-79.453512


In [18]:
column_names = ['Postal Code', 'Borough', 'Neighborhood','Latitude','Longitude']

# instantiate the dataframe
neighborhoods_ct = pd.DataFrame(columns=column_names)

# Iteración por filas del DataFrame:
for idx_row, row in centraltoronto_data.iterrows():
    Postal_Code = row[0]
    Borough = row[2]
    Latitude = row[3]
    Longitude = row[4]
    for Neighborhood in row[1].split(','):
        if Neighborhood !='':
            neighborhoods_ct= neighborhoods_ct.append({'Postal Code':Postal_Code,'Borough':Borough,'Neighborhood':Neighborhood,
                                                            'Latitude':Latitude,'Longitude':Longitude}, ignore_index=True)
neighborhoods_ct.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.648429,-79.38228
1,M4P,Central Toronto,Davisville North,43.718518,-79.464763
2,M4R,Central Toronto,North Toronto West,43.709577,-79.445073
3,M4R,Central Toronto,Lawrence Park,43.709577,-79.445073
4,M4S,Central Toronto,Davisville,43.693781,-79.428191



Get the neighborhood's latitude and longitude values.

In [19]:
address = 'Central Toronto,Toronto'

geolocator = Nominatim(user_agent="sb_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Central Toronto are 43.6534817, -79.3839347.


In [20]:
map_ct = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(neighborhoods_ct['Latitude'], centraltoronto_data['Longitude'], centraltoronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='#5A99A2',
        fill=True,
        fill_color='#3AC2CE',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ct)  
    
map_ct


 **Define Foursquare Credentials and Version**

In [21]:
CLIENT_ID = '3CU31MEK1COG5ND15MTGT2O0SWSUV5TTEBDRO4ME5IVVNDHB' # your Foursquare ID
CLIENT_SECRET = 'VECWVZNJA0ONZHX3ZD2TGDGJ1PLDWGX34A1AOFEIEUXPD11P' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3CU31MEK1COG5ND15MTGT2O0SWSUV5TTEBDRO4ME5IVVNDHB
CLIENT_SECRET:VECWVZNJA0ONZHX3ZD2TGDGJ1PLDWGX34A1AOFEIEUXPD11P


In [22]:
neighborhood_latitude = neighborhoods_ct.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighborhoods_ct.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = neighborhoods_ct.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Lawrence Park are 43.6484292, -79.3822802.


**Now, let's get the top 100 venues that are in Rouge within a radius of 500 metersNow, let's get the top 100 venues that are in Rouge within a radius of 500 meters**

In [23]:
# type your answer here

LIMIT = 1000 # limit of number of venues returned by Foursquare API
radius = 5000 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=3CU31MEK1COG5ND15MTGT2O0SWSUV5TTEBDRO4ME5IVVNDHB&client_secret=VECWVZNJA0ONZHX3ZD2TGDGJ1PLDWGX34A1AOFEIEUXPD11P&v=20180605&ll=43.6484292,-79.3822802&radius=5000&limit=1000'

Send the GET request and examine the resutls

In [24]:
results = requests.get(url).json()

From the Foursquare lab in the previous module, we know that all the information is in the items key. Before we proceed, let's borrow the get_category_type function from the Foursquare lab.

In [25]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


Now we are ready to clean the json and structure it into a pandas dataframe.

In [26]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Adelaide Club Toronto,Gym / Fitness Center,43.649279,-79.381921
1,Byblos Toronto,Mediterranean Restaurant,43.647615,-79.388381
2,Pai,Thai Restaurant,43.647923,-79.388579
3,Soho House Toronto,Speakeasy,43.648734,-79.386541
4,Downtown Toronto,Neighborhood,43.653232,-79.385296


# **Explore Neighborhoods in Toronto**

In [27]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [28]:
CentralT_venues = getNearbyVenues(names=neighborhoods_ct['Neighborhood'],
                                   latitudes=neighborhoods_ct['Latitude'],
                                   longitudes=neighborhoods_ct['Longitude']
                                  )

Lawrence Park
Davisville North
North Toronto West
  Lawrence Park
Davisville
Moore Park
 Summerhill East
Summerhill West
 Rathnelly
 South Hill
 Forest Hill SE
 Deer Park
Roselawn
Forest Hill North & West
 Forest Hill Road Park
The Annex
 North Midtown
 Yorkville


In [29]:
print(CentralT_venues.shape)
CentralT_venues.head()

(262, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.648429,-79.38228,Adelaide Club Toronto,43.649279,-79.381921,Gym / Fitness Center
1,Lawrence Park,43.648429,-79.38228,Equinox Bay Street,43.6481,-79.379989,Gym
2,Lawrence Park,43.648429,-79.38228,Canoe,43.647452,-79.38132,Restaurant
3,Lawrence Park,43.648429,-79.38228,Pilot Coffee Roasters,43.648835,-79.380936,Coffee Shop
4,Lawrence Park,43.648429,-79.38228,The Keg Steakhouse + Bar - York Street,43.649987,-79.384103,Restaurant


In [30]:
CentralT_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Lawrence Park,5,5,5,5,5,5
Deer Park,17,17,17,17,17,17
Forest Hill Road Park,1,1,1,1,1,1
Forest Hill SE,17,17,17,17,17,17
North Midtown,14,14,14,14,14,14
Rathnelly,17,17,17,17,17,17
South Hill,17,17,17,17,17,17
Summerhill East,4,4,4,4,4,4
Yorkville,14,14,14,14,14,14
Davisville,4,4,4,4,4,4


In [31]:
print('There are {} uniques categories.'.format(len(CentralT_venues['Venue Category'].unique())))

There are 81 uniques categories.


# **Analyze Each Neighborhood**

In [32]:
CentralT_onehot = pd.get_dummies(CentralT_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
CentralT_onehot['Neighborhood'] = CentralT_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [CentralT_onehot.columns[-1]] + list(CentralT_onehot.columns[:-1])
CentralT_onehot = CentralT_onehot[fixed_columns]

CentralT_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Art Gallery,Asian Restaurant,Athletics & Sports,Baby Store,Bakery,Bar,Baseball Field,Beer Bar,Bookstore,Boutique,Brazilian Restaurant,Building,Burger Joint,Burrito Place,Café,Candy Store,Clothing Store,Coffee Shop,Colombian Restaurant,Concert Hall,Convenience Store,Deli / Bodega,Department Store,Diner,Discount Store,Event Space,Fast Food Restaurant,Field,Food Court,Furniture / Home Store,Gastropub,General Travel,Gluten-free Restaurant,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Hardware Store,Hockey Arena,Hotel,Ice Cream Shop,Italian Restaurant,Japanese Restaurant,Lounge,Mediterranean Restaurant,Miscellaneous Shop,Monument / Landmark,Museum,New American Restaurant,Nightclub,Opera House,Park,Pizza Place,Playground,Plaza,Pool,Pub,Restaurant,River,Salad Place,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Speakeasy,Steakhouse,Supplement Shop,Sushi Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Theater,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [33]:
CentralT_grouped = CentralT_onehot.groupby('Neighborhood').mean().reset_index()
CentralT_grouped

Unnamed: 0,Neighborhood,Accessories Store,American Restaurant,Art Gallery,Asian Restaurant,Athletics & Sports,Baby Store,Bakery,Bar,Baseball Field,Beer Bar,Bookstore,Boutique,Brazilian Restaurant,Building,Burger Joint,Burrito Place,Café,Candy Store,Clothing Store,Coffee Shop,Colombian Restaurant,Concert Hall,Convenience Store,Deli / Bodega,Department Store,Diner,Discount Store,Event Space,Fast Food Restaurant,Field,Food Court,Furniture / Home Store,Gastropub,General Travel,Gluten-free Restaurant,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Hardware Store,Hockey Arena,Hotel,Ice Cream Shop,Italian Restaurant,Japanese Restaurant,Lounge,Mediterranean Restaurant,Miscellaneous Shop,Monument / Landmark,Museum,New American Restaurant,Nightclub,Opera House,Park,Pizza Place,Playground,Plaza,Pool,Pub,Restaurant,River,Salad Place,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Speakeasy,Steakhouse,Supplement Shop,Sushi Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Theater,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store
0,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Deer Park,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176471,0.058824,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Forest Hill Road Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Forest Hill SE,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176471,0.058824,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,North Midtown,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
5,Rathnelly,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176471,0.058824,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,South Hill,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176471,0.058824,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Summerhill East,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25
8,Yorkville,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.071429,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.0
9,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0


Let's print each neighborhood along with the top 5 most common venues

In [34]:
num_top_venues = 5
for hood in CentralT_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = CentralT_grouped[CentralT_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----  Lawrence Park----
                 venue  freq
0  Japanese Restaurant   0.2
1                  Pub   0.2
2               Bakery   0.2
3           Playground   0.2
4                 Park   0.2


---- Deer Park----
           venue  freq
0  Grocery Store  0.24
1           Café  0.18
2           Park  0.12
3      Nightclub  0.06
4     Restaurant  0.06


---- Forest Hill Road Park----
               venue  freq
0     Baseball Field   1.0
1  Accessories Store   0.0
2          Nightclub   0.0
3         Restaurant   0.0
4                Pub   0.0


---- Forest Hill SE----
           venue  freq
0  Grocery Store  0.24
1           Café  0.18
2           Park  0.12
3      Nightclub  0.06
4     Restaurant  0.06


---- North Midtown----
                    venue  freq
0       Convenience Store  0.07
1  Thrift / Vintage Store  0.07
2    Fast Food Restaurant  0.07
3          Sandwich Place  0.07
4         Supplement Shop  0.07


---- Rathnelly----
           venue  freq
0  Grocery Store  0.24


Let's put that into a pandas dataframe


In [35]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.



In [36]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = CentralT_grouped['Neighborhood']

for ind in np.arange(CentralT_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(CentralT_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Lawrence Park,Playground,Park,Japanese Restaurant,Bakery,Pub,Women's Store,Event Space,Convenience Store,Deli / Bodega,Department Store
1,Deer Park,Grocery Store,Café,Park,Italian Restaurant,Nightclub,Diner,Coffee Shop,Candy Store,Restaurant,Athletics & Sports
2,Forest Hill Road Park,Baseball Field,Women's Store,Field,Convenience Store,Deli / Bodega,Department Store,Diner,Discount Store,Event Space,Fast Food Restaurant
3,Forest Hill SE,Grocery Store,Café,Park,Italian Restaurant,Nightclub,Diner,Coffee Shop,Candy Store,Restaurant,Athletics & Sports
4,North Midtown,Gym,Tanning Salon,Grocery Store,Burrito Place,Burger Joint,Discount Store,Sandwich Place,Fast Food Restaurant,Supplement Shop,Convenience Store
