# The Battle of the Neighborhoods
Coursera Capstone Project - Applied Data Science

Made by Vanderlei M. Pereira F.

## Import Necessary Packages

In [215]:
# standard library: JSON files, environment variables (API credentials), time helpers
import json
import os
import time

# library to handle data in a vectorized manner
import numpy as np

# library for data analysis
import pandas as pd
from scipy import stats

# transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize

# library to handle requests
import requests

# matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# seaborn and associated plotting modules
import seaborn as sns

# cufflinks and associated plotting modules
import cufflinks as cf

# plotly and associated plotting modules
import plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff

# import k-means from clustering stage
from sklearn.cluster import KMeans

# map rendering library
import folium

# import beautifulsoup for html data scraping
from bs4 import BeautifulSoup

# import geocoder and geopy for geographic coordinates extraction
import geocoder
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# pandas display options: never truncate columns or rows in rendered frames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


print('Libraries imported.')

Libraries imported.


## Configure API credentials for the third-party services used

### Geocoder (Google) API Setup

In [2]:
# The Google geocoder key is a secret: read it from the environment instead of
# storing it in the notebook. Export GEOCODER_GOOGLE_KEY before starting Jupyter.
# The key is only needed when Toronto coordinates are fetched live rather than
# reloaded from the local .csv, so a missing key warns instead of stopping the run.
GEOCODER_GOOGLE_KEY = os.environ.get('GEOCODER_GOOGLE_KEY', '')
if not GEOCODER_GOOGLE_KEY:
    print('Warning: GEOCODER_GOOGLE_KEY is not set; live Google geocoding will not work.')

### Plotly API Setup

In [3]:
# Plotly credentials are secrets: read them from the environment
# (PLOTLY_USERNAME / PLOTLY_API_KEY) instead of storing them in the notebook.
# The credentials file is only written when both values are present, so an
# unset environment never replaces existing credentials with blanks.
_plotly_username = os.environ.get('PLOTLY_USERNAME')
_plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if _plotly_username and _plotly_api_key:
    py.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
else:
    print('Warning: PLOTLY_USERNAME / PLOTLY_API_KEY are not set; Plotly credentials were not configured.')

### Foursquare API Setup

In [4]:
# Foursquare credentials are secrets: read them from the environment
# (FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET) instead of storing them in the notebook.
# They are only needed when venues are fetched live rather than reloaded from
# the local .csv files, so missing values warn instead of stopping the run.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605'  # Foursquare API version
if not (CLIENT_ID and CLIENT_SECRET):
    print('Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; live Foursquare requests will not work.')

## Import New York City boroughs and neighborhoods data from JSON file
The New York City data is provided by the Coursera team through the link in the comments in the cell below.

In [7]:
# Open New York City Data from provided .json file
# https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
with open('nyu_2451_34572-geojson.json', encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)
print('New York City data imported.')

# Build one record per GeoJSON feature and create the DataFrame in a single
# step. Growing a DataFrame with .append() inside a loop copies the frame on
# every iteration and DataFrame.append no longer exists in pandas 2.x.
ny_records = [
    {
        'Borough': feature['properties']['borough'],
        'Neighborhood': feature['properties']['name'],
        # coordinates are stored as [longitude, latitude]
        'Latitude': feature['geometry']['coordinates'][1],
        'Longitude': feature['geometry']['coordinates'][0],
    }
    for feature in newyork_data['features']
]
ny_neighborhoods = pd.DataFrame(ny_records,
                                columns=['Borough', 'Neighborhood', 'Latitude', 'Longitude'])
print('Pandas DataFrame populated with New York City data.')

# Export data to csv file (the index is written as the first, unnamed column)
ny_neighborhoods.to_csv('ny_neighborhoods.csv', sep=',', encoding='utf-8')

ny_neighborhoods.tail()

New York City data imported.
Pandas DataFrame populated with New York City data.


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
301,Manhattan,Hudson Yards,40.756658,-74.000111
302,Queens,Hammels,40.587338,-73.80553
303,Queens,Bayswater,40.611322,-73.765968
304,Queens,Queensbridge,40.756091,-73.945631
305,Staten Island,Fox Hills,40.617311,-74.08174


In [12]:
# Alternatively, reload the New York City neighborhoods from the local
# 'ny_neighborhoods.csv' file instead of re-parsing the GeoJSON file.
# skiprows=1 discards the CSV header line and `names` supplies the column labels.
# NOTE(review): the CSV appears to hold one more column (the saved index) than
# `names` lists, in which case pandas uses that leading column as the index — confirm.

colnames = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
ny_neighborhoods = pd.read_csv('ny_neighborhoods.csv', skiprows=1, names=colnames)
ny_neighborhoods.tail()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
301,Manhattan,Hudson Yards,40.756658,-74.000111
302,Queens,Hammels,40.587338,-73.80553
303,Queens,Bayswater,40.611322,-73.765968
304,Queens,Queensbridge,40.756091,-73.945631
305,Staten Island,Fox Hills,40.617311,-74.08174


## Scrape Toronto boroughs and neighborhoods data from Wikipedia article

In [9]:
# Scrape Toronto data from Wikipedia
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
MAX_GEOCODE_ATTEMPTS = 5        # stop retrying a postcode after this many empty answers
GEOCODE_RETRY_DELAY_SECONDS = 1  # base pause between retries (multiplied by the attempt number)

# Fail early on network problems or HTTP errors instead of parsing an error page
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
source = response.text

# Create BeautifulSoup object
soup = BeautifulSoup(source, "html.parser")

# Locate the postal-code table; stop with a clear message if the page layout changed
wiki_table = soup.find('table', {'class':'wikitable sortable'})
if wiki_table is None:
    raise RuntimeError('Could not find the "wikitable sortable" table in the Wikipedia page.')
wiki_table_rows = wiki_table.find_all('tr')
res = []

# Get postcode, borough and neighborhood names from the wikipedia table
for table_row in wiki_table_rows:
    row = [cell.text.strip() for cell in table_row.find_all('td') if cell.text.strip()]
    # Header rows have no <td> cells; rows with fewer than three values cannot be used
    if len(row) < 3:
        continue
    if row[1] != 'Not assigned':
        # A neighborhood without a name inherits the name of its borough
        if row[2] == 'Not assigned':
            row[2] = row[1]
        res.append(row)
print('Toronto data scrapped from Wikipedia.')

# Iterate through 'res' and find coordinates for each row.
# The lookup is keyed on the postcode (first column), so rows sharing a
# postcode reuse the first answer instead of issuing another request.
print('Importing Toronto neighborhoods geographical coordinates using geocoder...')
postcode_coords = {}
last_index = len(res) - 1
for j, row in enumerate(res):

    # print progress at every 10% of the rows
    for pct in range(10, 101, 10):
        if j == int(pct / 100 * last_index):
            print('Geocoder loop {}% Complete.'.format(pct))

    postcode = row[0]
    if postcode not in postcode_coords:
        # Bounded retry with a growing pause: the original unbounded loop never
        # ended when the key was invalid or the quota was exhausted.
        for attempt in range(1, MAX_GEOCODE_ATTEMPTS + 1):
            g = geocoder.google('{}, Toronto, Ontario'.format(postcode), key=GEOCODER_GOOGLE_KEY)
            lat_lng_coords = g.latlng
            if lat_lng_coords:
                break
            time.sleep(GEOCODE_RETRY_DELAY_SECONDS * attempt)
        else:
            raise RuntimeError('No coordinates returned for postcode "{}" after {} attempts.'.format(
                postcode, MAX_GEOCODE_ATTEMPTS))
        postcode_coords[postcode] = lat_lng_coords

    # append coordinates (latitude, longitude) to the row
    lat_lng_coords = postcode_coords[postcode]
    row.append(lat_lng_coords[0])
    row.append(lat_lng_coords[1])

# Populate to_neighborhoods with toronto scrapped data from wikipedia
to_neighborhoods = pd.DataFrame(res, columns=["Postcode", "Borough", "Neighborhood", "Latitude", "Longitude"])
# Drop "Postcode" column
to_neighborhoods = to_neighborhoods.drop(columns='Postcode')
# Print alert
print('Pandas DataFrame populated with Toronto data.')
# Export data to local .csv file
to_neighborhoods.to_csv('to_neighborhoods.csv', sep=',', encoding='utf-8')
to_neighborhoods.tail()

Toronto data scrapped from Wikipedia.
Importing Toronto neighborhoods geographical coordinates using geocoder...
Geocoder loop 10% Complete.
Geocoder loop 20% Complete.
Geocoder loop 30% Complete.
Geocoder loop 40% Complete.
Geocoder loop 50% Complete.
Geocoder loop 60% Complete.
Geocoder loop 70% Complete.
Geocoder loop 80% Complete.
Geocoder loop 90% Complete.
Geocoder loop 100% Complete.
Pandas DataFrame populated with Toronto data.


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
207,Etobicoke,Kingsway Park South West,43.628841,-79.520999
208,Etobicoke,Mimico NW,43.628841,-79.520999
209,Etobicoke,The Queensway West,43.628841,-79.520999
210,Etobicoke,Royal York South West,43.628841,-79.520999
211,Etobicoke,South of Bloor,43.628841,-79.520999


In [14]:
# Alternatively, reload the Toronto neighborhoods from the local
# 'to_neighborhoods.csv' file instead of scraping and geocoding again.
# skiprows=1 discards the CSV header line and `names` supplies the column labels.
# NOTE(review): the CSV appears to hold one more column (the saved index) than
# `names` lists, in which case pandas uses that leading column as the index — confirm.

colnames = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
to_neighborhoods = pd.read_csv('to_neighborhoods.csv', skiprows=1, names=colnames)
to_neighborhoods.tail()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
207,Etobicoke,Kingsway Park South West,43.628841,-79.520999
208,Etobicoke,Mimico NW,43.628841,-79.520999
209,Etobicoke,The Queensway West,43.628841,-79.520999
210,Etobicoke,Royal York South West,43.628841,-79.520999
211,Etobicoke,South of Bloor,43.628841,-79.520999


## Visualizing the extracted borough and neighborhood data with Folium

In [15]:
def generate_map_of_city_boroughs_data(city_name, city_neighborhoods):
    """Build a folium map of a city with one circle marker per neighborhood.

    Parameters
    ----------
    city_name : str
        Free-text place name passed to the Nominatim geocoder,
        e.g. 'New York City, NY'. The result is used as the map center.
    city_neighborhoods : pandas.DataFrame
        Must provide the columns 'Borough', 'Neighborhood', 'Latitude'
        and 'Longitude'.

    Returns
    -------
    folium.Map
        Map centered on the city; each marker popup reads
        '<neighborhood>, <borough>'.

    Raises
    ------
    ValueError
        If the geocoder returns no result for ``city_name``.
    """
    # Find the city coordinates with Nominatim. A single lookup is made per
    # call, so no rate limiter is needed here (the original created one but
    # never used it).
    geolocator = Nominatim(user_agent="my_jupyter_notebook")
    city_location = geolocator.geocode(city_name)
    if city_location is None:
        raise ValueError('Could not find geographical coordinates for "{}".'.format(city_name))
    city_latitude = city_location.latitude
    city_longitude = city_location.longitude
    print('The geographical coordinates of "{}" are {}, {}.'.format(city_name, city_latitude, city_longitude))

    # Check number of Boroughs and Neighborhoods in the collected Dataset
    print('The "{}" dataframe has {} boroughs and {} neighborhoods.'.format(
          city_name,
          len(city_neighborhoods['Borough'].unique()),
          len(city_neighborhoods['Neighborhood'].unique())))

    # create map of city using latitude and longitude values
    map_city = folium.Map(location=[city_latitude, city_longitude], zoom_start=10)

    # add one marker per neighborhood
    for lat, lng, borough, neighborhood in zip(city_neighborhoods['Latitude'],
                                               city_neighborhoods['Longitude'],
                                               city_neighborhoods['Borough'],
                                               city_neighborhoods['Neighborhood']):
        # parse_html=True makes folium treat the label as plain text
        label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7).add_to(map_city)

    return map_city

## Map of New York City neighborhoods

In [16]:
# Render the New York City neighborhoods map (displayed as the cell output).
generate_map_of_city_boroughs_data(city_name='New York City, NY',
                                   city_neighborhoods=ny_neighborhoods)

The geographical coordinates of "New York City, NY" are 40.7308619, -73.9871558.
The "New York City, NY" dataframe has 5 boroughs and 302 neighborhoods.


## Map of Toronto neighborhoods

In [17]:
# Render the Toronto neighborhoods map (displayed as the cell output).
generate_map_of_city_boroughs_data(city_name='Toronto, ON',
                                   city_neighborhoods=to_neighborhoods)

The geographical coordinates of "Toronto, ON" are 43.653963, -79.387207.
The "Toronto, ON" dataframe has 11 boroughs and 210 neighborhoods.


## Extracting venues data for each neighborhood using the Foursquare API

In [18]:
# getNearbyVenues() is a function made to get the top venues that are in each neighborhood within a radius of X meters
def getNearbyVenues(names, latitudes, longitudes, limit=200, radius=1000):
    """Query the Foursquare 'explore' endpoint once per neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : sized iterables of equal length
        Neighborhood names and their coordinates (``len(names)`` is used
        for the progress messages).
    limit : int, default 200
        Value sent as the ``limit`` request parameter.
    radius : int, default 1000
        Value sent as the ``radius`` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but keeps these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    Venues returned without any category are skipped, because the category
    is the only venue attribute the later analysis uses.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    last_index = len(names) - 1
    venue_rows = []

    for j, (name, lat, lng) in enumerate(zip(names, latitudes, longitudes)):

        # print progress at every 10% of the neighborhoods
        for pct in range(10, 101, 10):
            if j == int(pct / 100 * last_index):
                print('Foursquare loop {}% Complete.'.format(pct))

        # Let requests build and escape the query string; the timeout keeps a
        # stalled connection from hanging the whole loop.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30)
        # Surface quota/credential problems as an HTTP error instead of a
        # KeyError on the missing 'groups' entry further down.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            if not categories:
                continue
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the column names to the constructor also works for zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

## Extract venue data for all neighborhoods in Toronto

In [17]:
print('Importing Toronto neighborhoods nearby venues using Foursquare...')

# Fetch the venues around every Toronto neighborhood (one request per neighborhood)
to_venues = getNearbyVenues(
    to_neighborhoods['Neighborhood'],
    to_neighborhoods['Latitude'],
    to_neighborhoods['Longitude'],
    limit=200,
)

# Report the number of venue rows and of distinct venue categories
print('The "to_venues" dataframe has {} venues and {} unique venue types.'.format(
    to_venues.shape[0],
    to_venues['Venue Category'].unique().size,
))

# Keep a local copy so the Foursquare requests need not be repeated
to_venues.to_csv('to_venues.csv', sep=',', encoding='UTF8')
to_venues.head()

Importing Toronto neighborhoods nearby venues using Foursquare...
Foursquare loop 10% Complete.
Foursquare loop 20% Complete.
Foursquare loop 30% Complete.
Foursquare loop 40% Complete.
Foursquare loop 50% Complete.
Foursquare loop 60% Complete.
Foursquare loop 70% Complete.
Foursquare loop 80% Complete.
Foursquare loop 90% Complete.
Foursquare loop 100% Complete.
The "to_venues" dataframe has 9306 venues and 334 unique venue types.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
3,Parkwoods,43.753259,-79.329656,A&W Canada,43.760643,-79.326865,Fast Food Restaurant
4,Parkwoods,43.753259,-79.329656,High Street Fish & Chips,43.74526,-79.324949,Fish & Chips Shop


In [20]:
# Alternatively, reload the Toronto venues from the local 'to_venues.csv'
# file instead of querying Foursquare again.
# skiprows=1 discards the CSV header line and `names` supplies the column labels.
# NOTE(review): the CSV appears to hold one more column (the saved index) than
# `names` lists, in which case pandas uses that leading column as the index — confirm.

colnames = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
to_venues = pd.read_csv('to_venues.csv', skiprows=1, names=colnames)
to_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
3,Parkwoods,43.753259,-79.329656,A&W Canada,43.760643,-79.326865,Fast Food Restaurant
4,Parkwoods,43.753259,-79.329656,High Street Fish & Chips,43.74526,-79.324949,Fish & Chips Shop


## Extract venue data for all neighborhoods in New York City

In [19]:
print('Importing New York City neighborhoods nearby venues using Foursquare...')

# Fetch the venues around every New York City neighborhood (one request per neighborhood)
ny_venues = getNearbyVenues(
    ny_neighborhoods['Neighborhood'],
    ny_neighborhoods['Latitude'],
    ny_neighborhoods['Longitude'],
    limit=200,
)

# Report the number of venue rows and of distinct venue categories
print('The "ny_venues" dataframe has {} venues and {} unique venue types.'.format(
    ny_venues.shape[0],
    ny_venues['Venue Category'].unique().size,
))

# Keep a local copy so the Foursquare requests need not be repeated
ny_venues.to_csv('ny_venues.csv', sep=',', encoding='UTF8')
ny_venues.head()

Importing New York City neighborhoods nearby venues using Foursquare...
Foursquare loop 10% Complete.
Foursquare loop 20% Complete.
Foursquare loop 30% Complete.
Foursquare loop 40% Complete.
Foursquare loop 50% Complete.
Foursquare loop 60% Complete.
Foursquare loop 70% Complete.
Foursquare loop 80% Complete.
Foursquare loop 90% Complete.
Foursquare loop 100% Complete.
The "ny_venues" dataframe has 20536 venues and 466 unique venue types.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Ripe Kitchen & Bar,40.898152,-73.838875,Caribbean Restaurant
2,Wakefield,40.894705,-73.847201,Rite Aid,40.896521,-73.84468,Pharmacy
3,Wakefield,40.894705,-73.847201,Jackie's West Indian Bakery,40.889283,-73.84331,Caribbean Restaurant
4,Wakefield,40.894705,-73.847201,Ali's Roti Shop,40.894036,-73.856935,Caribbean Restaurant


In [21]:
# Alternatively, reload the New York City venues from the local 'ny_venues.csv'
# file instead of querying Foursquare again.
# skiprows=1 discards the CSV header line and `names` supplies the column labels.
# NOTE(review): the CSV appears to hold one more column (the saved index) than
# `names` lists, in which case pandas uses that leading column as the index — confirm.

colnames = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
ny_venues = pd.read_csv('ny_venues.csv', skiprows=1, names=colnames)
ny_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Ripe Kitchen & Bar,40.898152,-73.838875,Caribbean Restaurant
2,Wakefield,40.894705,-73.847201,Rite Aid,40.896521,-73.84468,Pharmacy
3,Wakefield,40.894705,-73.847201,Jackie's West Indian Bakery,40.889283,-73.84331,Caribbean Restaurant
4,Wakefield,40.894705,-73.847201,Ali's Roti Shop,40.894036,-73.856935,Caribbean Restaurant


### New function for venue geographical visualization

In [22]:
def generate_map_of_city_venues_data(city_name, city_neighborhoods):
    """Build a folium map of a city with one small circle marker per venue.

    Parameters
    ----------
    city_name : str
        Free-text place name passed to the Nominatim geocoder,
        e.g. 'New York City, NY'. The result is used as the map center.
    city_neighborhoods : pandas.DataFrame
        Venue table; must provide the columns 'Neighborhood', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.

    Returns
    -------
    folium.Map
        Map centered on the city; each marker popup reads
        '<venue category>, <venue name>'.

    Raises
    ------
    ValueError
        If the geocoder returns no result for ``city_name``.
    """
    # Find the city coordinates with Nominatim. A single lookup is made per
    # call, so no rate limiter is needed here (the original created one but
    # never used it).
    geolocator = Nominatim(user_agent="my_jupyter_notebook")
    city_location = geolocator.geocode(city_name)
    if city_location is None:
        raise ValueError('Could not find geographical coordinates for "{}".'.format(city_name))
    city_latitude = city_location.latitude
    city_longitude = city_location.longitude
    print('The geographical coordinates of "{}" are {}, {}.'.format(city_name, city_latitude, city_longitude))

    # Check number of venue categories and neighborhoods in the collected dataset
    print('The "{}" dataframe has {} different venue types and {} neighborhoods.'.format(
          city_name,
          len(city_neighborhoods['Venue Category'].unique()),
          len(city_neighborhoods['Neighborhood'].unique())))

    # create map of city using latitude and longitude values
    map_city = folium.Map(location=[city_latitude, city_longitude], zoom_start=10)

    # add one marker per venue
    for lat, lng, venue, category in zip(city_neighborhoods['Venue Latitude'],
                                         city_neighborhoods['Venue Longitude'],
                                         city_neighborhoods['Venue'],
                                         city_neighborhoods['Venue Category']):
        # parse_html=True makes folium treat the label as plain text
        label = folium.Popup('{}, {}'.format(category, venue), parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=0.1,
            popup=label,
            color='red',
            fill=True,
            fill_color='#FF0000',
            fill_opacity=0.3).add_to(map_city)

    return map_city

### Toronto venues geographical visualization

In [2]:
# Render the Toronto venues map (displayed as the cell output).
generate_map_of_city_venues_data(city_name='Toronto, ON',
                                 city_neighborhoods=to_venues)

### New York City venues geographical visualization

In [1]:
# Render the New York City venues map (displayed as the cell output).
generate_map_of_city_venues_data(city_name='New York City, NY',
                                 city_neighborhoods=ny_venues)

## Manually group Foursquare's venues categories found in New York City and Toronto

### For Toronto

In [244]:
# Provenance of 'to_unique_venues.csv': the unique Toronto venue categories
# were first dumped one per line with
#     csv_text = "\n".join("{}".format(i) for i in to_venues['Venue Category'].unique())
#     with open('to_unique_venues.csv', 'w') as f:
#         f.write(csv_text)
# and the resulting file was then sorted by hand into the eight columns below.
# NOTE(review): this cell originally carried the New York City variant of the
# snippet (ny_venues / 'ny_unique_venues.csv'); it was swapped with the other
# city's cell, so the Toronto variant is shown here to match the file read below.

# Import the manually prepared data
# encoding='latin1', encoding='iso-8859-1' or encoding='cp1252'
colnames = ['BAR_CLUB', 'RESTAURANT', 'SERVICES', 'LEISURE_SPORTS', 'CULTURAL_SCHOOLS', 'PARKS_NATURE_RURAL', 'TRANSPORT_INFRASTRUCTURE', 'RESIDENTIAL']
to_unique_venues = pd.read_csv('to_unique_venues.csv', skiprows=1, names=colnames, encoding='latin1')

# Export columns to python lists. The groups differ in size, so the shorter
# columns are padded with NaN by read_csv; dropna() removes that padding.
to_BAR_CLUB = to_unique_venues['BAR_CLUB'].dropna().tolist()
to_RESTAURANT = to_unique_venues['RESTAURANT'].dropna().tolist()
to_SERVICES = to_unique_venues['SERVICES'].dropna().tolist()
to_LEISURE_SPORTS = to_unique_venues['LEISURE_SPORTS'].dropna().tolist()
to_CULTURAL_SCHOOLS = to_unique_venues['CULTURAL_SCHOOLS'].dropna().tolist()
to_PARKS_NATURE_RURAL = to_unique_venues['PARKS_NATURE_RURAL'].dropna().tolist()
to_TRANSPORT_INFRASTRUCTURE = to_unique_venues['TRANSPORT_INFRASTRUCTURE'].dropna().tolist()
to_RESIDENTIAL = to_unique_venues['RESIDENTIAL'].dropna().tolist()

to_unique_venues.head()

Unnamed: 0,BAR_CLUB,RESTAURANT,SERVICES,LEISURE_SPORTS,CULTURAL_SCHOOLS,PARKS_NATURE_RURAL,TRANSPORT_INFRASTRUCTURE,RESIDENTIAL
0,Café,Caribbean Restaurant,Fish & Chips Shop,Skating Rink,Historic Site,Park,Road,Playground
1,Food & Drink Shop,Fast Food Restaurant,Grocery Store,Tennis Court,Theater,Plaza,Bus Stop,Neighborhood
2,Coffee Shop,Pizza Place,Pharmacy,Hockey Arena,Performing Arts Venue,Garden,Train Station,Housing Development
3,Lounge,Chinese Restaurant,Supermarket,Athletics & Sports,Art Gallery,Field,Intersection,Recreation Center
4,Breakfast Spot,Portuguese Restaurant,Convenience Store,Golf Course,Dance Studio,Beach,Bus Line,Residential Building (Apartment / Condo)


In [245]:
# Number of hand-assigned Foursquare sub-categories per broad category (Toronto)
to_info = [
    ["Bars and Clubs", len(to_BAR_CLUB)],
    ["Restaurants", len(to_RESTAURANT)],
    ["Services", len(to_SERVICES)],
    ["Leisure and Sports", len(to_LEISURE_SPORTS)],
    ["Education and Culture", len(to_CULTURAL_SCHOOLS)],
    ["Nature and Parks", len(to_PARKS_NATURE_RURAL)],
    ["Transportation", len(to_TRANSPORT_INFRASTRUCTURE)],
    ["Residential", len(to_RESIDENTIAL)],
]

to_venues_info = pd.DataFrame(to_info, columns=["Category", "Unique Sub-Categories"])
to_venues_info

Unnamed: 0,Category,Unique Sub-Categories
0,Bars and Clubs,61
1,Restaurants,71
2,Services,97
3,Leisure and Sports,30
4,Education and Culture,31
5,Nature and Parks,23
6,Transportation,15
7,Residential,6


### For New York City

In [246]:
# Provenance of 'ny_unique_venues.csv': the unique New York City venue
# categories were first dumped one per line with
#     csv_text = "\n".join("{}".format(i) for i in ny_venues['Venue Category'].unique())
#     with open('ny_unique_venues.csv', 'w') as f:
#         f.write(csv_text)
# and the resulting file was then sorted by hand into the eight columns below.
# NOTE(review): this cell originally carried the Toronto variant of the
# snippet (to_venues / 'to_unique_venues.csv'); it was swapped with the other
# city's cell, so the New York City variant is shown here to match the file read below.

# Import the manually prepared data
# encoding='latin1', encoding='iso-8859-1' or encoding='cp1252'
colnames = ['BAR_CLUB', 'RESTAURANT', 'SERVICES', 'LEISURE_SPORTS', 'CULTURAL_SCHOOLS', 'PARKS_NATURE_RURAL', 'TRANSPORT_INFRASTRUCTURE', 'RESIDENTIAL']
ny_unique_venues = pd.read_csv('ny_unique_venues.csv', skiprows=1, names=colnames, encoding='latin1')

# Export columns to python lists. The groups differ in size, so the shorter
# columns are padded with NaN by read_csv; dropna() removes that padding.
ny_BAR_CLUB = ny_unique_venues['BAR_CLUB'].dropna().tolist()
ny_RESTAURANT = ny_unique_venues['RESTAURANT'].dropna().tolist()
ny_SERVICES = ny_unique_venues['SERVICES'].dropna().tolist()
ny_LEISURE_SPORTS = ny_unique_venues['LEISURE_SPORTS'].dropna().tolist()
ny_CULTURAL_SCHOOLS = ny_unique_venues['CULTURAL_SCHOOLS'].dropna().tolist()
ny_PARKS_NATURE_RURAL = ny_unique_venues['PARKS_NATURE_RURAL'].dropna().tolist()
ny_TRANSPORT_INFRASTRUCTURE = ny_unique_venues['TRANSPORT_INFRASTRUCTURE'].dropna().tolist()
ny_RESIDENTIAL = ny_unique_venues['RESIDENTIAL'].dropna().tolist()

ny_unique_venues.head()

Unnamed: 0,BAR_CLUB,RESTAURANT,SERVICES,LEISURE_SPORTS,CULTURAL_SCHOOLS,PARKS_NATURE_RURAL,TRANSPORT_INFRASTRUCTURE,RESIDENTIAL
0,Street Food Gathering,Argentinian Restaurant,Print Shop,Basketball Court,Movie Theater,Park,Bus Station,Residential Building (Apartment / Condo)
1,Rock Club,Israeli Restaurant,Big Box Store,Gym / Fitness Center,Public Art,Trail,Harbor / Marina,Neighborhood
2,Gay Bar,Colombian Restaurant,Shop & Service,Gym,Art Gallery,Garden,Bus Line,Indoor Play Area
3,Poke Place,Burmese Restaurant,Tourist Information Center,Bowling Alley,Historic Site,Plaza,Gas Station,Playground
4,Adult Boutique,Czech Restaurant,Entertainment Service,Yoga Studio,Performing Arts Venue,Scenic Lookout,Train Station,Building


In [247]:
# Number of hand-assigned Foursquare sub-categories per broad category (New York City)
ny_info = [
    ["Bars and Clubs", len(ny_BAR_CLUB)],
    ["Restaurants", len(ny_RESTAURANT)],
    ["Services", len(ny_SERVICES)],
    ["Leisure and Sports", len(ny_LEISURE_SPORTS)],
    ["Education and Culture", len(ny_CULTURAL_SCHOOLS)],
    ["Nature and Parks", len(ny_PARKS_NATURE_RURAL)],
    ["Transportation", len(ny_TRANSPORT_INFRASTRUCTURE)],
    ["Residential", len(ny_RESIDENTIAL)],
]

ny_venues_info = pd.DataFrame(ny_info, columns=["Category", "Unique Sub-Categories"])
ny_venues_info

Unnamed: 0,Category,Unique Sub-Categories
0,Bars and Clubs,71
1,Restaurants,105
2,Services,132
3,Leisure and Sports,48
4,Education and Culture,46
5,Nature and Parks,27
6,Transportation,31
7,Residential,6


## Plot the number of unique sub-categories found for each "larger" category defined

In [248]:
# One semi-transparent bar series per city, drawn over the same category axis
trace1, trace2 = [
    go.Bar(x=info['Category'],
           y=info['Unique Sub-Categories'],
           opacity=0.3,
           name=label)
    for info, label in (
        (to_venues_info, "Unique Sub-Categories in Toronto"),
        (ny_venues_info, "Unique Sub-Categories in New York City"),
    )
]

# 'overlay' draws the two series on top of each other instead of side by side
layout = go.Layout(barmode='overlay')
data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)

py.plotly.iplot(fig)

## Encode the venue data (preparation for statistical analysis)

### Encoder Function

In [249]:
def encode_venues_categories(dataframe):
    """One-hot encode each venue row into one of eight broad categories.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Venue table with the columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'.

    Returns
    -------
    list of list
        One entry per input row, in input order:
        [neighborhood, neighborhood latitude, neighborhood longitude,
         venue latitude, venue longitude,
         bars/clubs, restaurants, services, leisure/sports, culture/schools,
         parks/nature, transport, residential, 1]
        Exactly one of the eight flags is 1; the trailing 1 is a per-venue
        counter.

    Notes
    -----
    Reads the module-level ``to_*`` / ``ny_*`` category lists. A category is
    assigned to the first group that contains it, and a category found in no
    group is counted as a service (the fallback of the original code).
    """
    # Build the lookup sets once per call; the original concatenated the two
    # lists again for every row and every branch. The order fixes precedence.
    category_groups = [
        set(to_BAR_CLUB) | set(ny_BAR_CLUB),
        set(to_RESTAURANT) | set(ny_RESTAURANT),
        set(to_SERVICES) | set(ny_SERVICES),
        set(to_LEISURE_SPORTS) | set(ny_LEISURE_SPORTS),
        set(to_CULTURAL_SCHOOLS) | set(ny_CULTURAL_SCHOOLS),
        set(to_PARKS_NATURE_RURAL) | set(ny_PARKS_NATURE_RURAL),
        set(to_TRANSPORT_INFRASTRUCTURE) | set(ny_TRANSPORT_INFRASTRUCTURE),
        set(to_RESIDENTIAL) | set(ny_RESIDENTIAL),
    ]
    services_index = 2  # position of the services flag, used as the fallback

    res = []
    # Iterate over the needed columns directly instead of iterrows(), which
    # builds a Series object for every row.
    for neighborhood, hood_lat, hood_lng, venue_lat, venue_lng, category in zip(
            dataframe["Neighborhood"],
            dataframe["Neighborhood Latitude"],
            dataframe["Neighborhood Longitude"],
            dataframe["Venue Latitude"],
            dataframe["Venue Longitude"],
            dataframe["Venue Category"]):
        group_index = next(
            (i for i, group in enumerate(category_groups) if category in group),
            services_index)
        flags = [0] * len(category_groups)
        flags[group_index] = 1
        res.append([neighborhood, hood_lat, hood_lng, venue_lat, venue_lng] + flags + [1])
    return res

#Neighborhood
#Neighborhood Latitude
#Neighborhood Longitude
#Venue
#Venue Latitude
#Venue Longitude
#Venue Category

### Create encoded dataframes for Toronto

In [297]:
# Encode every Toronto venue as one indicator row (one column per category group)
to_encoded_venues = pd.DataFrame(
    encode_venues_categories(to_venues),
    columns=[
        "Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude",
        "Venue Latitude", "Venue Longitude",
        "Bars and Clubs", "Restaurants", "Services", "Leisure and Sports",
        "Education and Culture", "Nature and Parks", "Transportation",
        "Residential", "Total Venues",
    ],
)

# Keys that identify one neighborhood, and the non-count columns removed after aggregation
to_group_keys = ["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude"]
to_location_columns = ["Neighborhood Latitude", "Neighborhood Longitude",
                       "Venue Latitude", "Venue Longitude", "Neighborhood"]

# Venue counts per neighborhood, ordered by ascending total number of venues
to_venue_counts = to_encoded_venues.groupby(to_group_keys).sum()
to_venue_counts = to_venue_counts.sort_values(by=["Total Venues"]).reset_index()
# Set the names aside before dropping them; row order matches the counts frame
to_encoded_grouped_venues_Neighborhood = to_venue_counts["Neighborhood"]
to_encoded_grouped_venues = to_venue_counts.drop(columns=to_location_columns)

# Per-neighborhood mean of the 0/1 indicators (share of each category), used for KMeans.
# NOTE(review): 'Total Venues' is 1 on every venue row, so this sort key looks constant
# after .mean() and the resulting row order may differ from the counts frame -- confirm.
to_venue_shares = to_encoded_venues.groupby(to_group_keys).mean()
to_venue_shares = to_venue_shares.sort_values(by=["Total Venues"]).reset_index()
# Set the identifying columns aside; they are re-attached after clustering
to_encoded_grouped_venues_std_Neighborhood = to_venue_shares["Neighborhood"]
to_encoded_grouped_venues_std_Latitude = to_venue_shares["Neighborhood Latitude"]
to_encoded_grouped_venues_std_Longitude = to_venue_shares["Neighborhood Longitude"]
# Keep only the category columns as clustering features
to_encoded_grouped_venues_std = to_venue_shares.drop(columns=to_location_columns + ["Total Venues"])

In [298]:
# Preview the last rows of the per-venue encoded Toronto frame
to_encoded_venues.tail()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude,Bars and Clubs,Restaurants,Services,Leisure and Sports,Education and Culture,Nature and Parks,Transportation,Residential,Total Venues
9301,South of Bloor,43.628841,-79.520999,43.6203,-79.523906,0,0,1,0,0,0,0,0,1
9302,South of Bloor,43.628841,-79.520999,43.6234,-79.528894,0,0,1,0,0,0,0,0,1
9303,South of Bloor,43.628841,-79.520999,43.621304,-79.526146,0,0,1,0,0,0,0,0,1
9304,South of Bloor,43.628841,-79.520999,43.62134,-79.526708,0,0,1,0,0,0,0,0,1
9305,South of Bloor,43.628841,-79.520999,43.635358,-79.5293,0,0,1,0,0,0,0,0,1


In [299]:
# Preview the per-neighborhood venue counts for Toronto (sorted by 'Total Venues')
to_encoded_grouped_venues.tail()

Unnamed: 0,Bars and Clubs,Restaurants,Services,Leisure and Sports,Education and Culture,Nature and Parks,Transportation,Residential,Total Venues
206,36,26,23,3,9,3,0,0,100
207,44,22,20,4,6,3,1,0,100
208,38,30,23,6,0,2,0,1,100
209,38,23,22,5,9,3,0,0,100
210,25,40,16,3,13,2,0,1,100


In [300]:
# Preview the per-neighborhood category means for Toronto (KMeans input)
to_encoded_grouped_venues_std.tail()

Unnamed: 0,Bars and Clubs,Restaurants,Services,Leisure and Sports,Education and Culture,Nature and Parks,Transportation,Residential
206,0.176471,0.352941,0.352941,0.0,0.0,0.058824,0.058824,0.0
207,0.0,0.0,0.571429,0.142857,0.0,0.142857,0.142857,0.0
208,0.307692,0.307692,0.230769,0.038462,0.038462,0.076923,0.0,0.0
209,0.285714,0.214286,0.452381,0.0,0.047619,0.0,0.0,0.0
210,0.25,0.4,0.16,0.03,0.13,0.02,0.0,0.01


### Create encoded dataframes for New York City

In [336]:
# Encode every New York City venue as one indicator row (one column per category group)
ny_encoded_venues = pd.DataFrame(
    encode_venues_categories(ny_venues),
    columns=[
        "Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude",
        "Venue Latitude", "Venue Longitude",
        "Bars and Clubs", "Restaurants", "Services", "Leisure and Sports",
        "Education and Culture", "Nature and Parks", "Transportation",
        "Residential", "Total Venues",
    ],
)

# Keys that identify one neighborhood, and the non-count columns removed after aggregation
ny_group_keys = ["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude"]
ny_location_columns = ["Neighborhood Latitude", "Neighborhood Longitude",
                       "Venue Latitude", "Venue Longitude", "Neighborhood"]

# Venue counts per neighborhood, ordered by ascending total number of venues
ny_venue_counts = ny_encoded_venues.groupby(ny_group_keys).sum()
ny_venue_counts = ny_venue_counts.sort_values(by=["Total Venues"]).reset_index()
# Set the names aside before dropping them; row order matches the counts frame
ny_encoded_grouped_venues_Neighborhood = ny_venue_counts["Neighborhood"]
ny_encoded_grouped_venues = ny_venue_counts.drop(columns=ny_location_columns)

# Per-neighborhood mean of the 0/1 indicators (share of each category), used for KMeans.
# NOTE(review): 'Total Venues' is 1 on every venue row, so this sort key looks constant
# after .mean() and the resulting row order may differ from the counts frame -- confirm.
ny_venue_shares = ny_encoded_venues.groupby(ny_group_keys).mean()
ny_venue_shares = ny_venue_shares.sort_values(by=["Total Venues"]).reset_index()
# Set the identifying columns aside; they are re-attached after clustering
ny_encoded_grouped_venues_std_Neighborhood = ny_venue_shares["Neighborhood"]
ny_encoded_grouped_venues_std_Latitude = ny_venue_shares["Neighborhood Latitude"]
ny_encoded_grouped_venues_std_Longitude = ny_venue_shares["Neighborhood Longitude"]
# Keep only the category columns as clustering features
ny_encoded_grouped_venues_std = ny_venue_shares.drop(columns=ny_location_columns + ["Total Venues"])

In [302]:
# Preview the last rows of the per-venue encoded New York City frame
ny_encoded_venues.tail()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude,Bars and Clubs,Restaurants,Services,Leisure and Sports,Education and Culture,Nature and Parks,Transportation,Residential,Total Venues
20531,Fox Hills,40.617311,-74.08174,40.625058,-74.079428,0,0,1,0,0,0,0,0,1
20532,Fox Hills,40.617311,-74.08174,40.611261,-74.08885,0,1,0,0,0,0,0,0,1
20533,Fox Hills,40.617311,-74.08174,40.621716,-74.072229,0,1,0,0,0,0,0,0,1
20534,Fox Hills,40.617311,-74.08174,40.621363,-74.071165,0,0,0,0,0,0,1,0,1
20535,Fox Hills,40.617311,-74.08174,40.621543,-74.071498,0,0,0,0,0,0,1,0,1


In [303]:
# Preview the per-neighborhood venue counts for New York City (sorted by 'Total Venues')
ny_encoded_grouped_venues.tail()

Unnamed: 0,Bars and Clubs,Restaurants,Services,Leisure and Sports,Education and Culture,Nature and Parks,Transportation,Residential,Total Venues
301,25,59,13,1,1,1,0,0,100
302,25,30,34,4,1,3,1,2,100
303,38,33,24,3,0,2,0,0,100
304,25,33,30,6,5,1,0,0,100
305,34,35,15,13,1,2,0,0,100


In [304]:
# Preview the per-neighborhood category means for New York City (KMeans input)
ny_encoded_grouped_venues_std.tail()

Unnamed: 0,Bars and Clubs,Restaurants,Services,Leisure and Sports,Education and Culture,Nature and Parks,Transportation,Residential
301,0.2,0.31,0.25,0.19,0.02,0.03,0.0,0.0
302,0.3,0.41,0.22,0.02,0.03,0.0,0.0,0.02
303,0.3,0.24,0.24,0.07,0.07,0.08,0.0,0.0
304,0.23,0.39,0.23,0.08,0.02,0.04,0.01,0.0
305,0.34,0.35,0.15,0.13,0.01,0.02,0.0,0.0


## Visualizing the numbers of venues extracted for each city

### Venues bar chart for Toronto

In [258]:
# Send cufflinks charts to the online Plotly account, using the ggplot theme
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

# Number of venues retrieved for each Toronto neighborhood
series = to_encoded_venues['Neighborhood'].value_counts()

series.iplot(kind='bar', yTitle='Number of Venues', xTitle=None, title='Toronto numbers of venues per neighborhood',
             filename='toronto-bar-chart')

### Number of venues per neighborhood distribution in Toronto

In [259]:
# Distribution of the total number of venues per Toronto neighborhood
group_labels = ['Toronto Distplot']
fig = ff.create_distplot([to_encoded_grouped_venues['Total Venues'].values], group_labels)
# Dedicated filename: every distplot used to upload as 'Basic Distplot', each overwriting the last
py.plotly.iplot(fig, filename='toronto-total-venues-distplot')

### Venues bar chart for New York City

In [260]:
# Send cufflinks charts to the online Plotly account, using the ggplot theme
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

# Number of venues retrieved for each New York City neighborhood
series = ny_encoded_venues['Neighborhood'].value_counts()

series.iplot(kind='bar', yTitle='Number of Venues', xTitle=None, title='New York City numbers of venues per neighborhood',
             filename='newyork-bar-chart')

### Number of venues per neighborhood distribution in New York

In [261]:
# Distribution of the total number of venues per New York City neighborhood
group_labels = ['New York Distplot']
fig = ff.create_distplot([ny_encoded_grouped_venues['Total Venues'].values], group_labels)
# Dedicated filename: every distplot used to upload as 'Basic Distplot', each overwriting the last
py.plotly.iplot(fig, filename='newyork-total-venues-distplot')

## Number of Restaurants and Services Analysis

### Distribution of neighborhoods' numbers of restaurants in Toronto

In [262]:
# Send cufflinks charts to the online Plotly account, using the ggplot theme
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

# For each restaurant count, how many Toronto neighborhoods have exactly that many restaurants
series = to_encoded_grouped_venues['Restaurants'].value_counts()

series.iplot(kind='bar', xTitle='Number of Restaurants', yTitle='Number of Neighborhoods', 
             title='Number of Neighborhoods with X number of Restaurants in Toronto',
             filename='toronto_rest-bar-chart')

In [263]:
# Distribution of the number of restaurants per Toronto neighborhood
group_labels = ['Toronto Distplot']
fig = ff.create_distplot([to_encoded_grouped_venues['Restaurants'].values], group_labels)
# Dedicated filename: every distplot used to upload as 'Basic Distplot', each overwriting the last
py.plotly.iplot(fig, filename='toronto-restaurants-distplot')

### Distribution of neighborhoods' numbers of restaurants in New York City

In [264]:
# Send cufflinks charts to the online Plotly account, using the ggplot theme
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

# For each restaurant count, how many NYC neighborhoods have exactly that many restaurants
series = ny_encoded_grouped_venues['Restaurants'].value_counts()

series.iplot(kind='bar', xTitle='Number of Restaurants', yTitle='Number of Neighborhoods', 
             title='Number of Neighborhoods with X number of Restaurants in New York City',
             filename='newyork_rest-bar-chart')

In [265]:
# Distribution of the number of restaurants per New York City neighborhood
group_labels = ['New York Distplot']
fig = ff.create_distplot([ny_encoded_grouped_venues['Restaurants'].values], group_labels)
# Dedicated filename: every distplot used to upload as 'Basic Distplot', each overwriting the last
py.plotly.iplot(fig, filename='newyork-restaurants-distplot')

### Distribution Comparison between restaurants in Toronto and NYC

In [266]:
# Restaurant counts per neighborhood, one sample per city
x1 = to_encoded_grouped_venues['Restaurants'].values
x2 = ny_encoded_grouped_venues['Restaurants'].values

# Group data together
hist_data = [x1, x2]

group_labels = ['Toronto', 'NYC']

# Create distplot (no bin_size is passed, so the library default applies)
fig = ff.create_distplot(hist_data, group_labels)

# Dedicated filename: the three comparison plots used to share one name and overwrite each other
py.plotly.iplot(fig, filename='restaurants-toronto-vs-nyc-distplot')

### Distribution Comparison between bars and clubs in Toronto and NYC

In [267]:
# Bar and club counts per neighborhood, one sample per city
x1 = to_encoded_grouped_venues['Bars and Clubs'].values
x2 = ny_encoded_grouped_venues['Bars and Clubs'].values

# Group data together
hist_data = [x1, x2]

group_labels = ['Toronto', 'NYC']

# Create distplot (no bin_size is passed, so the library default applies)
fig = ff.create_distplot(hist_data, group_labels)

# Dedicated filename: the three comparison plots used to share one name and overwrite each other
py.plotly.iplot(fig, filename='bars-clubs-toronto-vs-nyc-distplot')

### Distribution Comparison between service venues in Toronto and NYC

In [268]:
# Service venue counts per neighborhood, one sample per city
x1 = to_encoded_grouped_venues['Services'].values
x2 = ny_encoded_grouped_venues['Services'].values

# Group data together
hist_data = [x1, x2]

group_labels = ['Toronto', 'NYC']

# Create distplot (no bin_size is passed, so the library default applies)
fig = ff.create_distplot(hist_data, group_labels)

# Dedicated filename: the three comparison plots used to share one name and overwrite each other
py.plotly.iplot(fig, filename='services-toronto-vs-nyc-distplot')

## K-Means Clustering of Neighborhoods

### Clustering neighborhoods in New York City

In [305]:
# Work on a real copy of the per-neighborhood category means: a plain assignment only
# aliases the frame, so adding 'Cluster' below would also mutate the source frame
ny_clustered_neighborhoods = ny_encoded_grouped_venues_std.copy()

# Feature columns used for clustering
clmns = ["Bars and Clubs", "Restaurants", "Services", "Leisure and Sports",
         "Education and Culture", "Nature and Parks", "Transportation", "Residential"]

# Fit on the feature columns only, so re-running the cell can never feed a
# previously assigned 'Cluster' (or any re-attached column) back into KMeans
kmeans = KMeans(n_clusters=5, random_state=0).fit(ny_clustered_neighborhoods[clmns])
labels = kmeans.labels_

# Make the new Cluster column
ny_clustered_neighborhoods['Cluster'] = labels

# Add the column into our list
clmns.extend(['Cluster'])

# Mean category share per cluster (one row per cluster)
ny_pie_clusters = ny_clustered_neighborhoods[clmns].groupby(['Cluster']).mean()
ny_pie_clusters

Unnamed: 0_level_0,Bars and Clubs,Restaurants,Services,Leisure and Sports,Education and Culture,Nature and Parks,Transportation,Residential
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.21764,0.343106,0.300926,0.042592,0.016387,0.030872,0.041015,0.007462
1,0.279928,0.226214,0.23302,0.085261,0.046315,0.06695,0.049356,0.012956
2,0.149446,0.125427,0.116549,0.0837,0.016799,0.270983,0.228957,0.00814
3,0.199853,0.209268,0.439199,0.050567,0.011946,0.030888,0.051456,0.006824
4,0.315527,0.377115,0.165586,0.048323,0.028788,0.034803,0.021777,0.00808


In [306]:
# Flip the table so each venue category is a row (numbered 0-7) and each cluster a column
ny_pie_clusters = ny_pie_clusters.T
ny_pie_clusters.index = list(range(8))
ny_pie_clusters.columns = ['results_{}'.format(cluster_id) for cluster_id in range(5)]
ny_pie_clusters

Unnamed: 0,results_0,results_1,results_2,results_3,results_4
0,0.21764,0.279928,0.149446,0.199853,0.315527
1,0.343106,0.226214,0.125427,0.209268,0.377115
2,0.300926,0.23302,0.116549,0.439199,0.165586
3,0.042592,0.085261,0.0837,0.050567,0.048323
4,0.016387,0.046315,0.016799,0.011946,0.028788
5,0.030872,0.06695,0.270983,0.030888,0.034803
6,0.041015,0.049356,0.228957,0.051456,0.021777
7,0.007462,0.012956,0.00814,0.006824,0.00808


In [308]:
# Category names, in the same order as the rows of ny_pie_clusters
llabels = np.array(['Bars and Clubs', 'Restaurants', 'Services', 'Leisure and Sports',
                    'Education and Culture', 'Nature and Parks', 'Transportation', 'Residential'])
# One slice color per category. Not named `colors`, because that would shadow the
# `matplotlib.colors` module imported as `colors` at the top of the notebook.
ny_pie_colors = np.array(['#d5f4e6', '#80ced6', '#618685', '#ffef96', '#50394c', '#b2b2b2', '#f4e1d2', '#fefbd8'])
ny_pie_clusters['labels'] = llabels
ny_pie_clusters['colors'] = ny_pie_colors
ny_pie_clusters

Unnamed: 0,results_0,results_1,results_2,results_3,results_4,labels,colors
0,0.21764,0.279928,0.149446,0.199853,0.315527,Bars and Clubs,#d5f4e6
1,0.343106,0.226214,0.125427,0.209268,0.377115,Restaurants,#80ced6
2,0.300926,0.23302,0.116549,0.439199,0.165586,Services,#618685
3,0.042592,0.085261,0.0837,0.050567,0.048323,Leisure and Sports,#ffef96
4,0.016387,0.046315,0.016799,0.011946,0.028788,Education and Culture,#50394c
5,0.030872,0.06695,0.270983,0.030888,0.034803,Nature and Parks,#b2b2b2
6,0.041015,0.049356,0.228957,0.051456,0.021777,Transportation,#f4e1d2
7,0.007462,0.012956,0.00814,0.006824,0.00808,Residential,#fefbd8


## Cluster 0 - NYC

In [309]:
# Category mix of NYC cluster 0, one slice per venue category
trace = go.Pie(labels=ny_pie_clusters['labels'], 
               values=ny_pie_clusters['results_0'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=ny_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='ny_cluster_0_pie_chart')

## Cluster 1 - NYC

In [310]:
# Category mix of NYC cluster 1, one slice per venue category
trace = go.Pie(labels=ny_pie_clusters['labels'], 
               values=ny_pie_clusters['results_1'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=ny_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='ny_cluster_1_pie_chart')

## Cluster 2 - NYC

In [311]:
# Category mix of NYC cluster 2, one slice per venue category
trace = go.Pie(labels=ny_pie_clusters['labels'], 
               values=ny_pie_clusters['results_2'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=ny_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='ny_cluster_2_pie_chart')

## Cluster 3 - NYC

In [312]:
# Category mix of NYC cluster 3, one slice per venue category
trace = go.Pie(labels=ny_pie_clusters['labels'], 
               values=ny_pie_clusters['results_3'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=ny_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='ny_cluster_3_pie_chart')

## Cluster 4 - NYC

In [313]:
# Category mix of NYC cluster 4, one slice per venue category
trace = go.Pie(labels=ny_pie_clusters['labels'], 
               values=ny_pie_clusters['results_4'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=ny_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='ny_cluster_4_pie_chart')

### Clustering neighborhoods in Toronto

In [314]:
# Work on a real copy of the per-neighborhood category means: a plain assignment only
# aliases the frame, so adding 'Cluster' below would also mutate the source frame
to_clustered_neighborhoods = to_encoded_grouped_venues_std.copy()

# Feature columns used for clustering
clmns = ["Bars and Clubs", "Restaurants", "Services", "Leisure and Sports",
         "Education and Culture", "Nature and Parks", "Transportation", "Residential"]

# Fit on the feature columns only, so re-running the cell can never feed a
# previously assigned 'Cluster' (or any re-attached column) back into KMeans
kmeans = KMeans(n_clusters=5, random_state=0).fit(to_clustered_neighborhoods[clmns])
labels = kmeans.labels_

# Make the new Cluster column
to_clustered_neighborhoods['Cluster'] = labels

# Add the column into our list
clmns.extend(['Cluster'])

# Mean category share per cluster (one row per cluster)
to_pie_clusters = to_clustered_neighborhoods[clmns].groupby(['Cluster']).mean()
to_pie_clusters

Unnamed: 0_level_0,Bars and Clubs,Restaurants,Services,Leisure and Sports,Education and Culture,Nature and Parks,Transportation,Residential
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.361622,0.275607,0.192588,0.050884,0.041167,0.044213,0.020304,0.013616
1,0.234322,0.389253,0.166915,0.063724,0.016045,0.112636,0.011736,0.005368
2,0.126733,0.121314,0.515741,0.076326,0.024926,0.087394,0.032663,0.014903
3,0.208889,0.0,0.011111,0.118889,0.057778,0.358889,0.244444,0.0
4,0.206426,0.295983,0.339679,0.063548,0.007766,0.040124,0.041297,0.005178


In [315]:
# Flip the table so each venue category is a row (numbered 0-7) and each cluster a column
to_pie_clusters = to_pie_clusters.T
to_pie_clusters.index = list(range(8))
to_pie_clusters.columns = ['results_{}'.format(cluster_id) for cluster_id in range(5)]
to_pie_clusters

Unnamed: 0,results_0,results_1,results_2,results_3,results_4
0,0.361622,0.234322,0.126733,0.208889,0.206426
1,0.275607,0.389253,0.121314,0.0,0.295983
2,0.192588,0.166915,0.515741,0.011111,0.339679
3,0.050884,0.063724,0.076326,0.118889,0.063548
4,0.041167,0.016045,0.024926,0.057778,0.007766
5,0.044213,0.112636,0.087394,0.358889,0.040124
6,0.020304,0.011736,0.032663,0.244444,0.041297
7,0.013616,0.005368,0.014903,0.0,0.005178


In [316]:
# Category names, in the same order as the rows of to_pie_clusters
llabels = np.array(['Bars and Clubs', 'Restaurants', 'Services', 'Leisure and Sports',
                    'Education and Culture', 'Nature and Parks', 'Transportation', 'Residential'])
# One slice color per category. Not named `colors`, because that would shadow the
# `matplotlib.colors` module imported as `colors` at the top of the notebook.
to_pie_colors = np.array(['#92a8d1', '#034f84', '#f7cac9', '#f7786b', '#deeaee', '#b1cbbb', '#eea29a', '#c94c4c'])
to_pie_clusters['labels'] = llabels
to_pie_clusters['colors'] = to_pie_colors
to_pie_clusters

Unnamed: 0,results_0,results_1,results_2,results_3,results_4,labels,colors
0,0.361622,0.234322,0.126733,0.208889,0.206426,Bars and Clubs,#92a8d1
1,0.275607,0.389253,0.121314,0.0,0.295983,Restaurants,#034f84
2,0.192588,0.166915,0.515741,0.011111,0.339679,Services,#f7cac9
3,0.050884,0.063724,0.076326,0.118889,0.063548,Leisure and Sports,#f7786b
4,0.041167,0.016045,0.024926,0.057778,0.007766,Education and Culture,#deeaee
5,0.044213,0.112636,0.087394,0.358889,0.040124,Nature and Parks,#b1cbbb
6,0.020304,0.011736,0.032663,0.244444,0.041297,Transportation,#eea29a
7,0.013616,0.005368,0.014903,0.0,0.005178,Residential,#c94c4c


## Cluster 0 - TO

In [317]:
# Category mix of Toronto cluster 0, one slice per venue category
trace = go.Pie(labels=to_pie_clusters['labels'], 
               values=to_pie_clusters['results_0'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=to_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='to_cluster_0_pie_chart')

## Cluster 1 - TO

In [318]:
# Category mix of Toronto cluster 1, one slice per venue category
trace = go.Pie(labels=to_pie_clusters['labels'], 
               values=to_pie_clusters['results_1'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=to_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='to_cluster_1_pie_chart')

## Cluster 2 - TO

In [319]:
# Category mix of Toronto cluster 2, one slice per venue category
trace = go.Pie(labels=to_pie_clusters['labels'], 
               values=to_pie_clusters['results_2'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=to_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='to_cluster_2_pie_chart')

## Cluster 3 - TO

In [320]:
# Category mix of Toronto cluster 3, one slice per venue category
trace = go.Pie(labels=to_pie_clusters['labels'], 
               values=to_pie_clusters['results_3'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=to_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='to_cluster_3_pie_chart')

## Cluster 4 - TO

In [321]:
# Category mix of Toronto cluster 4, one slice per venue category
trace = go.Pie(labels=to_pie_clusters['labels'], 
               values=to_pie_clusters['results_4'], 
               hoverinfo='label+percent', 
               textfont=dict(size=20),
               marker=dict(colors=to_pie_clusters['colors']))
# Dedicated filename: all pie charts used to upload as 'basic_pie_chart', each overwriting the last
py.plotly.iplot([trace], filename='to_cluster_4_pie_chart')

## Geographical visualization of clustered neighborhoods in New York City

### New map rendering function

In [322]:
# Marker color palettes for the cluster maps: five hex colors per city, passed as the
# `rainbow` argument of generate_map_of_city_clustered_neighborhoods
ny_rainbow = ['#ffef96', '#d5f4e6', '#b2b2b2', '#618685', '#80ced6']
to_rainbow = ['#92a8d1', '#034f84', '#f7cac9', '#b1cbbb', '#c94c4c']

def generate_map_of_city_clustered_neighborhoods(city_name, city_neighborhoods, kclusters, rainbow):
    """Render a folium map of a city with one circle marker per clustered neighborhood.

    Parameters
    ----------
    city_name : str
        Place name geocoded with Nominatim to center the map (e.g. 'New York City, NY').
    city_neighborhoods : pandas.DataFrame
        Must provide the 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude' and 'Cluster' columns.
    kclusters : int
        Number of clusters; only used in the printed summary.
    rainbow : list of str
        Marker colors indexed by cluster label (rainbow[0] is cluster 0).

    Returns
    -------
    folium.Map
        Map centered on the city with one marker per row of city_neighborhoods.

    Raises
    ------
    ValueError
        If the geocoder returns no result for city_name.
    """
    # Find the city's coordinates with the Nominatim geocoder, going through the
    # rate limiter (it was previously created but bypassed)
    geolocator = Nominatim(user_agent="my_jupyter_notebook")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    city_location = geocode(city_name)
    if city_location is None:
        raise ValueError('Could not geocode "{}".'.format(city_name))
    city_latitude = city_location.latitude
    city_longitude = city_location.longitude
    print('The geographical coordinates of "{}" are {}, {}.'.format(city_name, city_latitude, city_longitude))
    
    # Report the number of clusters and distinct neighborhood names being drawn
    print('The "{}" dataframe has {} clusters and {} neighborhoods.'.format(
          city_name,
          kclusters,
          len(city_neighborhoods['Neighborhood'].unique())))
    
    # create map of city using latitude and longitude values
    map_city = folium.Map(location=[city_latitude, city_longitude], zoom_start=10)

    # add markers to map
    for lat, lng, neighborhood, cluster in zip(city_neighborhoods['Neighborhood Latitude'], 
                                               city_neighborhoods['Neighborhood Longitude'], 
                                               city_neighborhoods['Neighborhood'], 
                                               city_neighborhoods['Cluster']):
        label = folium.Popup(str(neighborhood)+', Cluster: '+str(cluster), parse_html=True)
        # Cluster labels start at 0, so index the palette directly; the former
        # `cluster-1` gave cluster 0 the last color and shifted every other cluster
        marker_color = rainbow[cluster]
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.7).add_to(map_city)  

    return map_city

### Data preparation

In [323]:
# Re-attach the identifying columns that were set aside before clustering
for column_name, saved_values in [
        ('Neighborhood', ny_encoded_grouped_venues_std_Neighborhood),
        ('Neighborhood Latitude', ny_encoded_grouped_venues_std_Latitude),
        ('Neighborhood Longitude', ny_encoded_grouped_venues_std_Longitude)]:
    ny_clustered_neighborhoods[column_name] = saved_values

# Keep only the cluster label and the identifying columns
ny_clustered_neighborhoods = ny_clustered_neighborhoods.drop(
    columns=["Bars and Clubs", "Restaurants", "Services", "Leisure and Sports",
             "Education and Culture", "Nature and Parks", "Transportation", "Residential"])
ny_clustered_neighborhoods.tail()

Unnamed: 0,Cluster,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
301,0,Flatiron,40.739673,-73.990947
302,4,Flatbush,40.636326,-73.958401
303,1,Financial District,40.707107,-74.010665
304,0,Forest Hills,40.725264,-73.844475
305,4,Yorkville,40.77593,-73.947118


### New York City Map

In [324]:
# Draw the New York City neighborhoods, colored by cluster with the NYC palette
generate_map_of_city_clustered_neighborhoods('New York City, NY', ny_clustered_neighborhoods, 5, ny_rainbow)

The geographical coordinates of "New York City, NY" are 40.7308619, -73.9871558.
The "New York City, NY" dataframe has 5 clusters and 302 neighborhoods.


## Geographical visualization of clustered neighborhoods in Toronto

### Data preparation

In [326]:
# Re-attach the identifying columns that were set aside before clustering
for column_name, saved_values in [
        ('Neighborhood', to_encoded_grouped_venues_std_Neighborhood),
        ('Neighborhood Latitude', to_encoded_grouped_venues_std_Latitude),
        ('Neighborhood Longitude', to_encoded_grouped_venues_std_Longitude)]:
    to_clustered_neighborhoods[column_name] = saved_values

# Keep only the cluster label and the identifying columns
to_clustered_neighborhoods = to_clustered_neighborhoods.drop(
    columns=["Bars and Clubs", "Restaurants", "Services", "Leisure and Sports",
             "Education and Culture", "Nature and Parks", "Transportation", "Residential"])
to_clustered_neighborhoods.tail()

Unnamed: 0,Cluster,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
206,4,Humbergate,43.739416,-79.588437
207,2,Humberlea,43.724766,-79.532242
208,0,Humewood-Cedarvale,43.693781,-79.428191
209,4,Fairview,43.778517,-79.346556
210,1,Yorkville,43.67271,-79.405678


### Toronto Map

In [327]:
# Draw the Toronto neighborhoods, colored by cluster with the Toronto palette
generate_map_of_city_clustered_neighborhoods('Toronto, ON', to_clustered_neighborhoods, 5, to_rainbow)

The geographical coordinates of "Toronto, ON" are 43.653963, -79.387207.
The "Toronto, ON" dataframe has 5 clusters and 209 neighborhoods.


## List Neighborhoods of Interest in Toronto

### Cluster 4 : Untapped Markets

In [331]:
# Names of the Toronto neighborhoods assigned to cluster 4. Selecting the column by
# name replaces the positional column slice followed by `.Neighborhood`.
to_clustered_neighborhoods.loc[to_clustered_neighborhoods['Cluster'] == 4, 'Neighborhood']

6              Richview Gardens
10                     Roselawn
12                        Rouge
14        Royal York South West
18      Scarborough Town Centre
22                  Silverstone
26                Parkview Hill
29                    Mimico NW
35                  Mount Olive
47                       Oriole
51                     Maryvale
53                South Steeles
66              Wexford Heights
70               Wilson Heights
72             Woodbine Gardens
75              York Mills West
76                      Wexford
78             Thorncliffe Park
79                  Thistletown
80               South of Bloor
83                 St. Phillips
85                 Steeles West
96             The Beaches West
102          The Queensway West
105        Martin Grove Gardens
108              Cliffside West
122             Downsview North
123         Downsview Northwest
124              Downsview West
126        East Birchmount Park
129                 Dorset Park
135     

### Cluster 3 : Saturated Markets

In [332]:
# Names of the Toronto neighborhoods assigned to cluster 3. Selecting the column by
# name replaces the positional column slice followed by `.Neighborhood`.
to_clustered_neighborhoods.loc[to_clustered_neighborhoods['Cluster'] == 3, 'Neighborhood']

2          Railway Lands
21          Silver Hills
74            York Mills
103        South Niagara
138        Bathurst Quay
148             CN Tower
166     King and Spadina
176        Lawrence Park
183       Island airport
198    Harbourfront West
Name: Neighborhood, dtype: object

## List Neighborhoods of Interest in New York City

### Cluster 4 : Untapped Markets

In [342]:
# Names of the New York City neighborhoods assigned to cluster 4
ny_in_cluster_4 = ny_clustered_neighborhoods['Cluster'] == 4
ny_clustered_neighborhoods.loc[ny_in_cluster_4, 'Neighborhood']

7           Oakland Gardens
9                North Side
10          North Riverdale
11             North Corona
12                     Noho
25         Prospect Heights
32           Pelham Parkway
37             New Brighton
38              Murray Hill
41           Manhattanville
42         Manhattan Valley
44          Manhattan Beach
47          Lower East Side
58              Murray Hill
65      Morningside Heights
69            Midtown South
74               Ravenswood
78          Upper West Side
82               Turtle Bay
83               Tudor City
86              Tottenville
89              Throgs Neck
93        Sunnyside Gardens
102            Williamsburg
104              Whitestone
107            West Village
109           West Brighton
112               Sunnyside
114         Stuyvesant Town
120        Roosevelt Island
122          Rockaway Beach
124               Riverdale
125               Ridgewood
131           Schuylerville
133          Sheepshead Bay
134                S

### Cluster 0 : Saturated Markets

In [341]:
# Names of the New York City neighborhoods assigned to cluster 0
ny_in_cluster_0 = ny_clustered_neighborhoods['Cluster'] == 0
ny_clustered_neighborhoods.loc[ny_in_cluster_0, 'Neighborhood']

0                       Allerton
2                      Olinville
3                       Old Town
4                  Ocean Parkway
5                     Ocean Hill
8                        Norwood
14                      New Lots
16               Paerdegat Basin
18                     Park Hill
19                   Parkchester
21               Queensboro Hill
23           Prospect Park South
24     Prospect Lefferts Gardens
26                  Prince's Bay
27                 Port Richmond
35                    Park Slope
43             Manhattan Terrace
46                       Madison
48                      Longwood
50                   Little Neck
56              Mariner's Harbor
57                       Melrose
60                    Mount Eden
61                    Mott Haven
63                   Morris Park
68                       Midwood
76                     Rego Park
77                        Utopia
80            University Heights
81                     Unionport
84        