# The Battle of the Neighborhoods
Coursera Capstone Project - Applied Data Science

Made by Vanderlei M. Pereira F.

## Import Necessary Packages

In [48]:
# library to handle data in a vectorized manner
import numpy as np 

# library for data analsysis
import pandas as pd
from scipy import stats
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# library to handle JSON files
import json

# tranform JSON file into a pandas dataframe
from pandas.io.json import json_normalize

# useful time functions library
import time

# library to handle requests
import requests

# matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# seaborn and associated plotting modules
import seaborn as sns

# cufflings and associated plotting modules
import cufflinks as cf

# plotly and associated plotting modules
import plotly as py
import plotly.graph_objs as go

# import k-means from clustering stage
from sklearn.cluster import KMeans

# map rendering library
import folium 

# import beautifulsoup for html data scrapping
from bs4 import BeautifulSoup

# import geocoder and geopy for geographic coordinates extraction
import geocoder
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim 


print('Libraries imported.')

Libraries imported.


## Configure used third-party services API Credentials

## Geocoder.Google Setup

In [49]:
# Write your google geocoder credentials in the variable below
GEOCODER_GOOGLE_KEY = 'AIzaSyBuF-099dPdT5xRIGpIeiP8ruccYSmrxKg'

### Plotly API Setup

In [3]:
# Write your Plotly credentials in the function below
py.tools.set_credentials_file(username='vmunhozpf', api_key='ya4uwAjcVvSKPmxuRejh')

### Foursquare API Setup

In [None]:
# Write your Foursquare credentials in the variables below
CLIENT_ID = 'QXLKOWPH4O3GJ0LX43FYI0FPVFT2GXNRVWPPMWS2CMD3VJRX' # your Foursquare ID
CLIENT_SECRET = 'RC1YHBP2JK41GL11EG32QOAKNVA2WKQAO20IBRLOWLIY2PFH' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

## Import New York City boroughs and neighborhoods data from JSON file
The New York City data is provided by the Coursera team through the link in the comments in the cell below.

In [4]:
# Open New York City Data from provided .json file
# https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
with open('nyu_2451_34572-geojson.json') as json_data:
    newyork_data = json.load(json_data)
print('New York City data imported.')

# Create empty new york data pandas DataFrame
ny_neighborhoods = pd.DataFrame(columns=['Borough', 'Neighborhood', 'Latitude', 'Longitude'])

# Populate ny_neighborhoods_df with new york imported json data
for data in newyork_data['features']:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    ny_neighborhoods = ny_neighborhoods.append({'Borough': borough,
                                                'Neighborhood': neighborhood_name,
                                                'Latitude': neighborhood_lat,
                                                'Longitude': neighborhood_lon}, 
                                                ignore_index=True)
print('Pandas DataFrame populated with New York City data.')
ny_neighborhoods.tail()

New York City data imported.
Pandas DataFrame populated with New York City data.


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
301,Manhattan,Hudson Yards,40.756658,-74.000111
302,Queens,Hammels,40.587338,-73.80553
303,Queens,Bayswater,40.611322,-73.765968
304,Queens,Queensbridge,40.756091,-73.945631
305,Staten Island,Fox Hills,40.617311,-74.08174


## Scrap Toronto boroughs and neighborhoods data from Wikipedia article

In [5]:
# Scrap Toronto Data from Wikipedia
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

# Create BeautifulSoup object
soup = BeautifulSoup(source, "html.parser")

# Scrap wikipedia HTML data using BeautifulSoup
wiki_table = soup.find('table', {'class':'wikitable sortable'})
wiki_table_rows = wiki_table.findAll('tr')
res = []

# Get boroughs and neighborhoods names from wikipedia table
for tr in wiki_table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        if (row[1]!='Not assigned'):
            if (row[2]=='Not assigned'):
                row[2]=row[1]
            res.append(row)
print('Toronto data scrapped from Wikipedia.')

# Iterate through 'res' array and find coordinates for each row (borough)
print('Importing Toronto neighborhoods geographical coordinates using geocoder...')
for j in range(0, len(res)):
    
    # print progress
    if (j == int(0.1*(len(res)-1))):
        print('Geocoder loop 10% Complete.')
    if (j == int(0.2*(len(res)-1))):
        print('Geocoder loop 20% Complete.')
    if (j == int(0.3*(len(res)-1))):
        print('Geocoder loop 30% Complete.')
    if (j == int(0.4*(len(res)-1))):
        print('Geocoder loop 40% Complete.')
    if (j == int(0.5*(len(res)-1))):
        print('Geocoder loop 50% Complete.')
    if (j == int(0.6*(len(res)-1))):
        print('Geocoder loop 60% Complete.')
    if (j == int(0.7*(len(res)-1))):
        print('Geocoder loop 70% Complete.')
    if (j == int(0.8*(len(res)-1))):
        print('Geocoder loop 80% Complete.')
    if (j == int(0.9*(len(res)-1))):
        print('Geocoder loop 90% Complete.')
    if (j == int((len(res)-1))):
        print('Geocoder loop 100% Complete.')
    
    # send request
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(res[j][0]), key=GEOCODER_GOOGLE_KEY)
        lat_lng_coords = g.latlng
    
    # append coordinates to 'res' array
    res[j].append(lat_lng_coords[0])
    res[j].append(lat_lng_coords[1])            
             
# Populate to_neighborhoods_df with toronto scrapped data from wikipedia
to_neighborhoods = pd.DataFrame(res, columns=["Postcode", "Borough", "Neighborhood", "Latitude", "Longitude"])
# Drop "Postcode" column
to_neighborhoods = to_neighborhoods.drop(columns='Postcode')
print('Pandas DataFrame populated with Toronto data.')
to_neighborhoods.tail()

Toronto data scrapped from Wikipedia.
Importing Toronto neighborhoods geographical coordinates using geocoder...
Geocoder loop 10% Complete.
Geocoder loop 20% Complete.
Geocoder loop 30% Complete.
Geocoder loop 40% Complete.
Geocoder loop 50% Complete.
Geocoder loop 60% Complete.
Geocoder loop 70% Complete.
Geocoder loop 80% Complete.
Geocoder loop 90% Complete.
Geocoder loop 100% Complete.
Pandas DataFrame populated with Toronto data.


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
207,Etobicoke,Kingsway Park South West,43.628841,-79.520999
208,Etobicoke,Mimico NW,43.628841,-79.520999
209,Etobicoke,The Queensway West,43.628841,-79.520999
210,Etobicoke,Royal York South West,43.628841,-79.520999
211,Etobicoke,South of Bloor,43.628841,-79.520999


## Visualizing the extracted borough and neighborhood data with Folium

In [6]:
def generate_map_of_city_boroughs_data(city_name, city_neighborhoods):
    
    # Find city geographical coordinates using geocode google API
    geolocator = Nominatim(user_agent="my_jupyter_notebook")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    city_location = geolocator.geocode(city_name) #'New York City, NY'
    city_latitude = city_location.latitude
    city_longitude = city_location.longitude
    print('The geographical coordinates of "{}" are {}, {}.'.format(city_name, city_latitude, city_longitude))
    
    # Check number of Boroughs and Neighborhoods in the collected Dataset
    print('The "{}" dataframe has {} boroughs and {} neighborhoods.'.format(
          city_name,
          len(city_neighborhoods['Borough'].unique()),
          len(city_neighborhoods['Neighborhood'].unique())))
    
    # create map of city using latitude and longitude values
    map_city = folium.Map(location=[city_latitude, city_longitude], zoom_start=10)

    # add markers to map
    for lat, lng, borough, neighborhood in zip(city_neighborhoods['Latitude'], city_neighborhoods['Longitude'], city_neighborhoods['Borough'], city_neighborhoods['Neighborhood']):
        label = '{}, {}'.format(neighborhood, borough)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7).add_to(map_city)  

    return map_city

## Map of New York City neighborhoods

In [7]:
generate_map_of_city_boroughs_data('New York City, NY', ny_neighborhoods)

The geographical coordinates of "New York City, NY" are 40.7308619, -73.9871558.
The "New York City, NY" dataframe has 5 boroughs and 302 neighborhoods.


## Map of Toronto neighborhoods

In [8]:
generate_map_of_city_boroughs_data('Toronto, ON', to_neighborhoods)

The geographical coordinates of "Toronto, ON" are 43.653963, -79.387207.
The "Toronto, ON" dataframe has 11 boroughs and 210 neighborhoods.


## Extracting venues data for each neighborhood using the Foursquare API

In [10]:
# getNearbyVenues() is a function made to get the top venues that are in each neighborhood within a radius of X meters
def getNearbyVenues(names, latitudes, longitudes, limit=200, radius=500):
    
    venues_list=[]
    j = 0
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        
         # print progress
        if (j == int(0.1*(len(names)-1))):
            print('Foursquare loop 10% Complete.')
        if (j == int(0.2*(len(names)-1))):
            print('Foursquare loop 20% Complete.')
        if (j == int(0.3*(len(names)-1))):
            print('Foursquare loop 30% Complete.')
        if (j == int(0.4*(len(names)-1))):
            print('Foursquare loop 40% Complete.')
        if (j == int(0.5*(len(names)-1))):
            print('Foursquare loop 50% Complete.')
        if (j == int(0.6*(len(names)-1))):
            print('Foursquare loop 60% Complete.')
        if (j == int(0.7*(len(names)-1))):
            print('Foursquare loop 70% Complete.')
        if (j == int(0.8*(len(names)-1))):
            print('Foursquare loop 80% Complete.')
        if (j == int(0.9*(len(names)-1))):
            print('Foursquare loop 90% Complete.')
        if (j == int((len(names)-1))):
            print('Foursquare loop 100% Complete.')
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        j=j+1
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                             'Neighborhood Latitude', 
                             'Neighborhood Longitude', 
                             'Venue', 
                             'Venue Latitude', 
                             'Venue Longitude', 
                             'Venue Category']
    
    return(nearby_venues)

## Extract venue data for all neighborhoods in Toronto

In [11]:
print('Importing Toronto neighborhoods nearby venues using Foursquare...')
# Get data from Foursquare
to_venues = getNearbyVenues(names=to_neighborhoods['Neighborhood'],
                            latitudes=to_neighborhoods['Latitude'],
                            longitudes=to_neighborhoods['Longitude'],
                            limit=200)

print('The "to_venues" dataframe has {} venues and {} unique venue types.'.format(
      len(to_venues['Venue Category']),
      len(to_venues['Venue Category'].unique())))
to_venues.head()

Importing Toronto neighborhoods nearby venues using Foursquare...
Foursquare loop 10% Complete.
Foursquare loop 20% Complete.
Foursquare loop 30% Complete.
Foursquare loop 40% Complete.
Foursquare loop 50% Complete.
Foursquare loop 60% Complete.
Foursquare loop 70% Complete.
Foursquare loop 80% Complete.
Foursquare loop 90% Complete.
Foursquare loop 100% Complete.
The "to_venues" dataframe has 4443 venues and 268 unique venue types.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


## Extract venue data for all neighborhoods in New York City

In [14]:
print('Importing New York City neighborhoods nearby venues using Foursquare...')
# Get data from Foursquare
ny_venues = getNearbyVenues(names=ny_neighborhoods['Neighborhood'],
                            latitudes=ny_neighborhoods['Latitude'],
                            longitudes=ny_neighborhoods['Longitude'],
                            limit=200)

print('The "ny_venues" dataframe has {} venues and {} unique venue types.'.format(
      len(ny_venues['Venue Category']),
      len(ny_venues['Venue Category'].unique())))
ny_venues.head()

Importing New York City neighborhoods nearby venues using Foursquare...
Foursquare loop 10% Complete.
Foursquare loop 20% Complete.
Foursquare loop 30% Complete.
Foursquare loop 40% Complete.
Foursquare loop 50% Complete.
Foursquare loop 60% Complete.
Foursquare loop 70% Complete.
Foursquare loop 80% Complete.
Foursquare loop 90% Complete.
Foursquare loop 100% Complete.
The "ny_venues" dataframe has 10225 venues and 426 unique venue types.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Rite Aid,40.896521,-73.84468,Pharmacy
2,Wakefield,40.894705,-73.847201,Cooler Runnings Jamaican Restaurant Inc,40.898283,-73.850478,Caribbean Restaurant
3,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
4,Wakefield,40.894705,-73.847201,Dunkin Donuts,40.890631,-73.849027,Donut Shop


## Manually group Foursquare's venues categories found in New York City

In [15]:
# ny_venues['Venue Category'].unique()

# University
ny_university = ['School', 'College Academic Building', 'High School', 'College Bookstore', 'College Theater', 
                 'College Cafeteria', 'College Basketball Court']

# Services
ny_urban_services = ['Pharmacy', 'Salon / Barbershop', 'Bakery', 'Plaza', 'Bank', 'Home Service', 'Rental Car Location', 
                     'Eye Doctor', 'Health & Beauty Service', 'Market', 'Gas Station', 'Distillery', 'Brewery',
                     'Farmers Market', 'Recycling Facility', 'Waste Facility', 'Fish Market', 'Spa', 
                     'Check Cashing Service', 'Tattoo Parlor', 'Lawyer', 'Laundromat', 'Locksmith', 'Butcher',
                     'Laundry Service', 'Coworking Space', 'Nail Salon', 'Organic Grocery', 'Automotive Shop',
                     'Neighborhood', 'Non-Profit', "Doctor's Office", 'Hardware Store', 'Event Space', 
                     'Residential Building (Apartment / Condo)', 'Other Repair Shop', 'Massage Studio', 'Event Service',
                     'Motorcycle Shop', 'Bike Rental / Bike Share', 'Shop & Service', 'Business Service',
                     'Medical Center', 'Health Food Store', 'Photography Studio', 'Piercing Parlor', 'Adult Boutique',
                     'Weight Loss Center', 'Board Shop', 'Veterinarian', 'Daycare', 'Dry Cleaner', 'Drugstore', 'Office',
                     'Flea Market', 'Shoe Repair', 'Pet Café', 'Auto Workshop', 'Animal Shelter', 'Newsstand',
                     'Design Studio', 'Construction & Landscaping', 'Pet Service', 'Rental Service', 'Post Office', 
                     'Motel', 'Storage Facility', 'Recording Studio', 'Tanning Salon', 'IT Services', 'Auto Garage',
                     'Insurance Office', 'Community Center', 'Tech Startup', 'Factory', 'Stables']

# Restaurant
ny_restaurants = ['Caribbean Restaurant', 'Pizza Place', 'Restaurant', 'Chinese Restaurant', 'Fried Chicken Joint', 
                  'Diner', 'Seafood Restaurant', 'Fast Food Restaurant', 'Gourmet Shop', 'Latin American Restaurant',
                  'Burger Joint', 'Mexican Restaurant', 'Spanish Restaurant', 'Wings Joint', 'Comfort Food Restaurant', 
                  'Steakhouse', 'Italian Restaurant', 'Indian Restaurant', 'American Restaurant', 'Sushi Restaurant', 
                  'French Restaurant', 'African Restaurant', 'Food', 'Greek Restaurant', 'Paella Restaurant',
                  'Fish & Chips Shop', 'Asian Restaurant', 'Peruvian Restaurant', 'BBQ Joint', 'South American Restaurant', 
                  'Arepa Restaurant', 'Food & Drink Shop', 'Mediterranean Restaurant', 'Japanese Restaurant', 
                  'Thai Restaurant', 'Middle Eastern Restaurant', 'New American Restaurant', 'Vietnamese Restaurant',
                  'Dim Sum Restaurant', 'Shabu-Shabu Restaurant', 'Noodle House', 'Hotpot Restaurant', 
                  'Dumpling Restaurant', 'Polish Restaurant', 'Ramen Restaurant', 'Vegetarian / Vegan Restaurant', 
                  'Falafel Restaurant', 'Korean Restaurant', 'Eastern European Restaurant', 'Russian Restaurant', 
                  'Varenyky restaurant', 'Turkish Restaurant', 'Food Court', 'Southern / Soul Food Restaurant', 
                  'Cajun / Creole Restaurant', 'Pakistani Restaurant', 'Ethiopian Restaurant', 'Argentinian Restaurant',
                  'Filipino Restaurant', 'Cuban Restaurant', 'Israeli Restaurant', 'Tapas Restaurant',
                  'German Restaurant', 'Cantonese Restaurant', 'Halal Restaurant', 'Shanghai Restaurant',
                  'Kebab Restaurant', 'Tex-Mex Restaurant', 'Taiwanese Restaurant', 'Lebanese Restaurant',
                  'Poke Place', 'Jewish Restaurant', 'English Restaurant', 'Malay Restaurant', 'Austrian Restaurant',
                  'Japanese Curry Restaurant', 'Afghan Restaurant', 'Czech Restaurant', 'Australian Restaurant',
                  'South Indian Restaurant', 'Szechuan Restaurant', 'Hawaiian Restaurant', 'Brazilian Restaurant',
                  'Udon Restaurant', 'Gluten-free Restaurant', 'Moroccan Restaurant', 'Swiss Restaurant', 
                  'Modern European Restaurant', 'Belgian Restaurant', 'North Indian Restaurant', 'Himalayan Restaurant', 
                  'Empanada Restaurant', 'Colombian Restaurant', 'Indonesian Restaurant', 'Romanian Restaurant', 
                  'Kosher Restaurant', 'Sri Lankan Restaurant', 'Tibetan Restaurant', 'Venezuelan Restaurant', 
                  'Molecular Gastronomy Restaurant', 'Persian Restaurant', 'Cambodian Restaurant', 'Soba Restaurant',
                  'Portuguese Restaurant']
 
# Shopping
ny_shopping = ['Discount Store', 'Shopping Mall', 'Mattress Store', 'Grocery Store', 'Liquor Store', 'Gift Shop', 
               'Accessories Store', 'Convenience Store', 'Warehouse Store', 'Thrift / Vintage Store', 'Supermarket',
               'Shoe Store', 'Supplement Shop', 'Video Store', 'Shipping Store', 'Pet Store', 'Bagel Shop', 
               'Department Store', 'Video Game Store', 'Cosmetics Shop', "Men's Store", 'Electronics Store', 
               'Frozen Yogurt Shop', 'Wine Shop', 'Optical Shop', 'Miscellaneous Shop', 'Mobile Phone Shop', 
               'Sporting Goods Shop', 'Clothing Store', 'Kids Store', 'Paper / Office Supplies Store', 'Outlet Store',
               'Music Store', 'Flower Shop', 'Furniture / Home Store', "Women's Store", 'Antique Shop', 'Cheese Shop',
               'Fruit & Vegetable Store', 'Hobby Shop', 'Print Shop', 'Lingerie Store', 'Bridal Shop', 'Record Shop',
               'Arts & Crafts Store', 'Boutique', 'Vape Store', 'Food Stand', 'Chocolate Shop', 'Comic Shop',
               'Herbs & Spices Store', 'Bike Shop', 'Jewelry Store', 'Tailor Shop', 'Watch Shop', 'Souvlaki Shop',
               'Bed & Breakfast', 'Toll Plaza', 'Big Box Store', 'Baby Store', 'Camera Store', 'Duty-free Shop',
               'Smoothie Shop']

# Culture
ny_culture = ['Performing Arts Venue', 'Music Venue', 'History Museum', 'Art Gallery', 'Art Museum', 'Dance Studio', 
              'Bookstore', 'Museum', 'Used Bookstore', 'Indie Theater', 'Bistro', 'Indie Movie Theater',
              'Opera House','Theater', 'Concert Hall', 'Arts & Entertainment', 'Library', 'Music School', 'Public Art',
              'Street Art', 'Exhibit', 'Cultural Center', 'Auditorium', 'Multiplex']

# Sports
ny_sports = ['Baseball Field', 'Bowling Alley', 'Gym', 'Yoga Studio', 'Tennis Stadium', 'Athletics & Sports',
             'Gym / Fitness Center', 'Basketball Court', 'Sports Club', 'Martial Arts Dojo', 'Pool', 'Tennis Court',
             'Pool Hall', 'Stadium', 'Gymnastics Gym', 'Racetrack', 'Pilates Studio', 'Cycle Studio', 'Gym Pool',
             'Baseball Stadium', 'Skating Rink', 'Golf Course', 'Soccer Field', 'Boxing Gym', 'Climbing Gym', 
             'Mini Golf', 'Volleyball Court', 'Skate Park', 'Roller Rink']

# Getting Out
ny_getting_out = ['Dessert Shop', 'Ice Cream Shop', 'Donut Shop', 'Sandwich Place', 'Food Truck', 'Deli / Bodega',
                  'Juice Bar', 'Rock Club', 'Pub', 'Beer Bar', 'Bar', 'Coffee Shop', 'Breakfast Spot', 'Candy Store', 
                  'Café', 'Sports Bar', 'Soup Place', 'Nightclub', 'Arcade', 'Smoke Shop', 'Cupcake Shop', 'Hookah Bar',
                  'Buffet', 'Piano Bar', 'Dive Bar', 'Other Nightlife', 'Taco Place', 'Tea Room', 'Snack Place', 
                  'Karaoke Bar', 'Gastropub', 'Cocktail Bar', 'Beer Store', 'Whisky Bar', 'Wine Bar', 'Creperie', 
                  'Jazz Club', 'Salad Place', 'Pie Shop', 'Burrito Place', 'Speakeasy', 'Beer Garden', 'Gaming Cafe',
                  'Tiki Bar', 'General Entertainment', 'Bubble Tea Shop', 'Hot Dog Joint', 'Movie Theater', 'Roof Deck',
                  'Gay Bar', 'Sake Bar', 'Rest Area', 'Cafeteria', 'Hotel Bar', 'Club House', 'Recreation Center', 
                  'Comedy Club', 'Irish Pub', 'Social Club', 'Street Food Gathering', 'Beach Bar', 'Bath House',
                  'Strip Club', 'Spiritual Center']

# Touristic
ny_touristic = ['Hotel', 'Historic Site', 'Hostel', 'Scenic Lookout', 'Other Great Outdoors', 'Beach', 
                'Outdoors & Recreation', 'Waterfront', 'Church', 'Farm', 'Outdoor Sculpture', 'Monument / Landmark',
                'Theme Park Ride / Attraction', 'Surf Spot', 'Bike Trail', 'Resort', 'Memorial Site', 'Lighthouse',
                'Tourist Information Center', 'Theme Park']

# Parks & Transportation
ny_parks_and_transport = ['Bus Station', 'Park', 'Platform', 'Metro Station', 'River', 'Playground', 'Trail', 'Bus Stop',
                          'Train Station', 'Moving Target', 'Track', 'Harbor / Marina', 'Boat or Ferry', 'Intersection', 
                          'Bus Line', 'Airport Tram', 'Road', 'Building', 'Garden Center', 'Airport Terminal', 'Garden', 
                          'Dog Run', 'Lake', 'Sculpture Garden', 'Fountain', 'Pier', 'Tree', 'Train', 'Pedestrian Plaza',
                          'Field', 'State / Provincial Park', 'Heliport', 'Bridge', 'Campground']


ny_info = []
ny_info.append(["University Spots", len(ny_university)])
ny_info.append(["Services", len(ny_urban_services)])
ny_info.append(["Restaurants", len(ny_restaurants)])
ny_info.append(["Shops", len(ny_shopping)])
ny_info.append(["Cultural Spots", len(ny_culture)])
ny_info.append(["Sports Venues", len(ny_sports)])
ny_info.append(["Bars/Clubs", len(ny_getting_out)])
ny_info.append(["Touristic Sites", len(ny_touristic)])
ny_info.append(["Parks", len(ny_parks_and_transport)])

ny_venues_info = pd.DataFrame(ny_info, columns=["Category", "Unique Sub-Categories"])
ny_venues_info

Unnamed: 0,Category,Unique Sub-Categories
0,University Spots,7
1,Services,76
2,Restaurants,104
3,Shops,63
4,Cultural Spots,24
5,Sports Venues,29
6,Bars/Clubs,64
7,Touristic Sites,20
8,Parks,34


## Manually group Foursquare's venues categories found in Toronto

In [16]:
# to_venues['Venue Category'].unique()

# University
to_university = ['Fraternity House', 'College Auditorium', 'College Quad', 'College Rec Center', 'College Stadium', 
                 'Swim School', 'College Gym', 'College Arts Building']

# Services
to_urban_services = ['Animal Shelter', 'Bank', 'Health & Beauty Service', 'Office', 'Bakery', 'Fish Market', 
                     'Rental Car Location', 'Medical Center', 'Hockey Arena', 'Pharmacy', 'Paper / Office Supplies Store', 
                     'Tech Startup', 'Convenience Store', 'Neighborhood', 'Market', 'Business Service', 'Auto Dealership', 
                     'Housing Development', 'Salon / Barbershop', 'Gas Station', 'Massage Studio', 'Moving Target', 
                     'Building', 'Construction & Landscaping', 'Home Service', 'Empanada Restaurant', 'Auto Workshop', 
                     'Coworking Space', 'Butcher', 'Check Cashing Service', 'Community Center', 'Pilates Studio', 
                     'Chiropractor', 'Tanning Salon']

# Restaurant
to_restaurants = ['Fast Food Restaurant', 'Food & Drink Shop', 'Restaurant', 'Portuguese Restaurant', 'Pizza Place', 
                  'Chocolate Shop', 'Farmers Market', 'Dessert Shop', 'Mexican Restaurant', 'Mediterranean Restaurant',
                  'Italian Restaurant', 'French Restaurant', 'Greek Restaurant', 'Health Food Store', 
                  'Vietnamese Restaurant', 'Korean Restaurant', 'Sushi Restaurant', 'Gastropub', 'Creperie', 
                  'Persian Restaurant', 'Japanese Restaurant', 'Burrito Place', 'Ramen Restaurant', 'Burger Joint', 
                  'Seafood Restaurant', 'Diner', 'Chinese Restaurant', 'Ethiopian Restaurant', 'Wings Joint', 
                  'Sandwich Place', 'Middle Eastern Restaurant', 'Falafel Restaurant', 'Caribbean Restaurant', 'Taco Place', 
                  'Thai Restaurant', 'Steakhouse', 'American Restaurant', 'Vegetarian / Vegan Restaurant', 'Food Court',
                  'Modern European Restaurant', 'BBQ Joint', 'Asian Restaurant', 'Dim Sum Restaurant', 'Food Truck', 
                  'Breakfast Spot', 'New American Restaurant', 'Molecular Gastronomy Restaurant', 'Hawaiian Restaurant', 
                  'Latin American Restaurant', 'Indian Restaurant', 'Salad Place', 'Bagel Shop', 'German Restaurant', 
                  'Fried Chicken Joint', 'Belgian Restaurant', 'Comfort Food Restaurant', 'Tapas Restaurant', 
                  'Hakka Restaurant', 'Frozen Yogurt Shop', 'Afghan Restaurant', 'Brazilian Restaurant', 
                  'Gluten-free Restaurant', 'Poutine Place', 'Gourmet Shop', 'Cuban Restaurant', 'Malay Restaurant', 
                  'Mac & Cheese Joint', 'Southern / Soul Food Restaurant', 'Eastern European Restaurant', 'Food',
                  'Indonesian Restaurant', 'Cajun / Creole Restaurant', 'Jewish Restaurant', 'Tex-Mex Restaurant', 
                  'Churrascaria', 'Dumpling Restaurant', 'Arepa Restaurant', 'Doner Restaurant', 'Hotpot Restaurant', 
                  'Filipino Restaurant', 'Cantonese Restaurant', 'Taiwanese Restaurant']
 
# Shopping
to_shopping = ['Shoe Store', 'Cosmetics Shop', 'Brewery', 'Electronics Store', 'Liquor Store', 'Antique Shop', 'Boutique', 
               'Furniture / Home Store', 'Grocery Store', 'Accessories Store', "Women's Store", 'Clothing Store', 'Hobby Shop', 
               'Pet Store', 'Bubble Tea Shop', 'Arts & Crafts Store', 'Beer Store', 'Adult Boutique', 'Smoothie Shop',
               'Shopping Mall', 'Supermarket', 'Miscellaneous Shop', 'Department Store', 'Toy / Game Store', 'Bike Shop',
               'Sporting Goods Shop', 'Tailor Shop', 'Optical Shop', 'Fish & Chips Shop', 'Discount Store', 'Baby Store', 
               'Flower Shop', 'Bridal Shop', 'Video Store', 'Warehouse Store', 'Noodle House', "Men's Store", 
               'Fruit & Vegetable Store', 'Thrift / Vintage Store', 'Outdoor Supply Store', 'Stationery Store', 
               'Hardware Store', 'Turkish Restaurant', 'Flea Market', 'Shop & Service', 'Organic Grocery', 'Drugstore', 
               'Print Shop']

# Culture
to_culture = ['Performing Arts Venue', 'Event Space', 'Theater', 'Art Gallery', 'Bookstore', 'Mobile Phone Shop', 
              'Comic Shop', 'Concert Hall', 'Movie Theater', 'Music Venue', 'Jazz Club', 'Museum', 'Record Shop',
              'Arcade', 'Art Museum', 'Opera House', 'Candy Store', 'Video Game Store', 'Gift Shop', 'Jewelry Store', 
              'Supplement Shop', 'Dance Studio', 'History Museum', 'Amphitheater', 'Indie Movie Theater', 
              'Recording Studio', 'Library']

# Sports
to_sports = ['Gym / Fitness Center', 'Spa', 'Yoga Studio', 'Gym', 'Curling Ice', 'Skating Rink', 'Athletics & Sports', 
             'Rock Climbing Spot', 'Basketball Stadium', 'Soccer Field', 'Pool', 'Dog Run', 'Tennis Court', 
             'Basketball Court', 'Baseball Field', 'Baseball Stadium', 'Gym Pool', 'Climbing Gym', 'Stadium', 
             'Martial Arts Dojo']

# Getting Out
to_getting_out = ['Pub', 'Coffee Shop', 'Café', 'Ice Cream Shop',  'Bowling Alley', 'Nightclub', 'General Entertainment', 
                  'Bar', 'College Cafeteria', 'Plaza', 'Tea Room', 'Beer Bar', 'Juice Bar', 'Lounge', 'Hookah Bar', 
                  'Wine Bar', 'Cocktail Bar', 'Bistro', 'Sports Bar', 'Deli / Bodega', 'Irish Pub', 'Donut Shop', 
                  'Speakeasy', 'Smoke Shop', 'Cupcake Shop', 'Theme Park Ride / Attraction', 'Theme Park', 'Dive Bar', 'Snack Place',
                  'Piano Bar', 'Gaming Cafe', 'Strip Club', 'Skate Park', 'Gay Bar', 'Sake Bar']

# Touristic
to_touristic = ['Historic Site', 'Hotel', 'Church', 'Hostel', 'Monument / Landmark', 'Hotel Bar', 'General Travel', 
                'Aquarium']

# Parks & Transportation
to_parks_and_transport = ['Park', 'Intersection', 'Tram Station', 'Metro Station', 'Field', 'Trail', 'Beach', 'Fountain', 
                          'Train Station', 'Playground', 'Golf Course', 'Bus Station', 'Bus Line', 'Lake', 'Scenic Lookout',
                          'Light Rail Station', 'Airport', 'Bus Stop', 'Garden', 'Garden Center', 'Other Great Outdoors',
                          'Airport Lounge', 'Harbor / Marina', 'Airport Food Court', 'Airport Terminal', 'Airport Gate', 
                          'Plane', 'Airport Service', 'Sculpture Garden', 'Boat or Ferry', 'Pier', 'Farm', 'River']

to_info = []
to_info.append(["University Spots", len(to_university)])
to_info.append(["Services", len(to_urban_services)])
to_info.append(["Restaurants", len(to_restaurants)])
to_info.append(["Shops", len(to_shopping)])
to_info.append(["Cultural Spots", len(to_culture)])
to_info.append(["Sports Venues", len(to_sports)])
to_info.append(["Bars/Clubs", len(to_getting_out)])
to_info.append(["Touristic Sites", len(to_touristic)])
to_info.append(["Parks", len(to_parks_and_transport)])

to_venues_info = pd.DataFrame(to_info, columns=["Category", "Unique Sub-Categories"])
to_venues_info

Unnamed: 0,Category,Unique Sub-Categories
0,University Spots,8
1,Services,34
2,Restaurants,82
3,Shops,48
4,Cultural Spots,27
5,Sports Venues,20
6,Bars/Clubs,35
7,Touristic Sites,8
8,Parks,33


## Plot the number of unique sub-categories found for each "larger" category defined

In [17]:
trace1 = go.Bar(x=to_venues_info['Category'],
                y=to_venues_info['Unique Sub-Categories'],
                opacity=0.3,
                name="Unique Sub-Categories in Toronto")
trace2 = go.Bar(x=ny_venues_info['Category'],
                y=ny_venues_info['Unique Sub-Categories'],
                opacity=0.3,
                name="Unique Sub-Categories in New York City")

data = [trace1, trace2]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

py.plotly.iplot(fig)

## Encode the venue data (preparation for statistical analysis)

### Encoder Function

In [38]:
def encode_venues_categories(dataframe):
    res = []
    for index, row in dataframe.iterrows():
        if row["Venue Category"] in to_university:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        1, 0, 0, 0, 0, 0, 0, 0, 0, 1])
        elif row["Venue Category"] in to_urban_services:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        0, 1, 0, 0, 0, 0, 0, 0, 0, 1])
        elif row["Venue Category"] in to_restaurants:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        0, 0, 1, 0, 0, 0, 0, 0, 0, 1])
        elif row["Venue Category"] in to_shopping:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        0, 0, 0, 1, 0, 0, 0, 0, 0, 1])
        elif row["Venue Category"] in to_culture:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        0, 0, 0, 0, 1, 0, 0, 0, 0, 1])
        elif row["Venue Category"] in to_sports:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        0, 0, 0, 0, 0, 1, 0, 0, 0, 1])
        elif row["Venue Category"] in to_getting_out:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        0, 0, 0, 0, 0, 0, 1, 0, 0, 1])
        elif row["Venue Category"] in to_getting_out:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        0, 0, 0, 0, 0, 0, 0, 1, 0, 1])
        elif row["Venue Category"] in to_parks_and_transport:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
        else:
            res.append([row["Neighborhood"], row["Neighborhood Latitude"], row["Neighborhood Longitude"],
                        row["Venue Latitude"], row["Venue Longitude"],
                        0, 1, 0, 0, 0, 0, 0, 0, 0, 1])
    return res

### Create encoded dataframes for Toronto

In [105]:
# Create encoded venues dataframe
to_encoded_venues = pd.DataFrame(encode_venues_categories(to_venues), 
                                 columns=["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude", 
                                          "Venue Latitude", "Venue Longitude", 
                                          "University Spots", "Services", "Restaurants",
                                          "Shops", "Cultural Spots", "Sports Venues",
                                          "Bars/Clubs", "Touristic Sites", "Parks", "Total Venues"])

# Create encoded grouped venues dataframe
to_encoded_grouped_venues = to_encoded_venues.groupby(['Neighborhood', 
                                                       'Neighborhood Latitude', 
                                                       'Neighborhood Longitude']).sum().sort_values(by=['Total Venues']).reset_index()
# Save Neighborhood column for later
to_encoded_grouped_venues_Neighborhood = to_encoded_grouped_venues['Neighborhood']
# Drop non-integer columns
to_encoded_grouped_venues = to_encoded_grouped_venues.drop(['Neighborhood Latitude', 
                                                            'Neighborhood Longitude',
                                                            'Venue Latitude',
                                                            'Venue Longitude',
                                                            'Neighborhood'], axis=1)

# Prepare encoded grouped venues dataframe for KMeans clustering
to_encoded_grouped_venues_std = to_encoded_venues.groupby(['Neighborhood', 
                                                           'Neighborhood Latitude', 
                                                           'Neighborhood Longitude']).mean().sort_values(by=['Total Venues']).reset_index()
# Save columns for later
to_encoded_grouped_venues_std_Neighborhood = to_encoded_grouped_venues_std['Neighborhood']
to_encoded_grouped_venues_std_Latitude = to_encoded_grouped_venues_std['Neighborhood Latitude']
to_encoded_grouped_venues_std_Longitude = to_encoded_grouped_venues_std['Neighborhood Longitude']
# Drop non-integer columns
to_encoded_grouped_venues_std = to_encoded_grouped_venues_std.drop(['Neighborhood Latitude', 
                                                                    'Neighborhood Longitude',
                                                                    'Venue Latitude',
                                                                    'Venue Longitude',
                                                                    'Neighborhood',
                                                                    'Total Venues'], axis=1)

to_encoded_grouped_venues_std.tail()

Unnamed: 0,University Spots,Services,Restaurants,Shops,Cultural Spots,Sports Venues,Bars/Clubs,Touristic Sites,Parks
205,0.0,0.1,0.4,0.4,0.0,0.0,0.1,0.0,0.0
206,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0
207,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.75
208,0.0,0.095238,0.253968,0.380952,0.095238,0.015873,0.111111,0.0,0.047619
209,0.0,0.0,0.5,0.083333,0.041667,0.041667,0.291667,0.0,0.041667


### Create encoded dataframes for New York City

In [106]:
# Create encoded venues dataframe
ny_encoded_venues = pd.DataFrame(encode_venues_categories(ny_venues), 
                                 columns=["Neighborhood", "Neighborhood Latitude", "Neighborhood Longitude", 
                                          "Venue Latitude", "Venue Longitude", 
                                          "University Spots", "Services", "Restaurants",
                                          "Shops", "Cultural Spots", "Sports Venues",
                                          "Bars/Clubs", "Touristic Sites", "Parks", "Total Venues"])

# Create encoded grouped venues dataframe
ny_encoded_grouped_venues = ny_encoded_venues.groupby(['Neighborhood', 
                                                       'Neighborhood Latitude', 
                                                       'Neighborhood Longitude']).sum().sort_values(by=['Total Venues']).reset_index()
# Save Neighborhood column for later
ny_encoded_grouped_venues_Neighborhood = ny_encoded_grouped_venues['Neighborhood']
# Drop non-integer columns
ny_encoded_grouped_venues = ny_encoded_grouped_venues.drop(['Neighborhood Latitude', 
                                                            'Neighborhood Longitude',
                                                            'Venue Latitude',
                                                            'Venue Longitude',
                                                            'Neighborhood'], axis=1)

# Prepare encoded grouped venues dataframe for KMeans clustering
ny_encoded_grouped_venues_std = ny_encoded_venues.groupby(['Neighborhood', 
                                                           'Neighborhood Latitude', 
                                                           'Neighborhood Longitude']).mean().sort_values(by=['Total Venues']).reset_index()
# Save Neighborhood column for later
ny_encoded_grouped_venues_std_Neighborhood = ny_encoded_grouped_venues_std['Neighborhood']
ny_encoded_grouped_venues_std_Latitude = ny_encoded_grouped_venues_std['Neighborhood Latitude']
ny_encoded_grouped_venues_std_Longitude = ny_encoded_grouped_venues_std['Neighborhood Longitude']
# Drop non-integer columns
ny_encoded_grouped_venues_std = ny_encoded_grouped_venues_std.drop(['Neighborhood Latitude', 
                                                                    'Neighborhood Longitude',
                                                                    'Venue Latitude',
                                                                    'Venue Longitude',
                                                                    'Neighborhood',
                                                                    'Total Venues'], axis=1)

ny_encoded_grouped_venues_std.tail()

Unnamed: 0,University Spots,Services,Restaurants,Shops,Cultural Spots,Sports Venues,Bars/Clubs,Touristic Sites,Parks
301,0.0,0.19,0.39,0.15,0.03,0.15,0.09,0.0,0.0
302,0.0,0.25,0.375,0.0,0.0,0.0,0.291667,0.0,0.083333
303,0.0,0.15,0.4,0.04,0.06,0.07,0.25,0.0,0.03
304,0.0,0.189189,0.27027,0.108108,0.081081,0.216216,0.081081,0.0,0.054054
305,0.0,0.14,0.41,0.04,0.0,0.12,0.27,0.0,0.02


## Visualizing the numbers of venues extracted for each city

### Venues bar chart for Toronto

In [73]:
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

series = to_encoded_venues['Neighborhood'].value_counts()#[:20]
series.head(3)

series.iplot(kind='bar', yTitle='Number of Venues', title='Toronto numbers of venues per neighborhood',
             filename='toronto-bar-chart')

### Venues bar chart for New York City

In [74]:
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

series = ny_encoded_venues['Neighborhood'].value_counts()#[:20]
series.head(3)

series.iplot(kind='bar', yTitle='Number of Venues', title='New York City numbers of venues per neighborhood',
             filename='newyork-bar-chart')

## Number of Restaurants and Services Analysis

### Distribution of neighborhoods' numbers of restaurants in Toronto

In [75]:
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

series = to_encoded_grouped_venues['Restaurants'].value_counts()
series.head(3)

series.iplot(kind='bar', xTitle='Number of Restaurants', yTitle='Number of Neighborhoods', 
             title='Number of Neighborhoods with X number of Restaurants in Toronto',
             filename='toronto_rest-bar-chart')

### Distribution of neighborhoods' numbers of restaurants in New York City

In [76]:
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

series = ny_encoded_grouped_venues['Restaurants'].value_counts()
series.head(3)

series.iplot(kind='bar', xTitle='Number of Restaurants', yTitle='Number of Neighborhoods', 
             title='Number of Neighborhoods with X number of Restaurants in New York City',
             filename='newyork_rest-bar-chart')

## K-Means Clustering of Neighborhoods

### Clustering neighborhoods in New York City

In [107]:
# Copy the encoded grouped venues standarized dataframe
ny_clustered_neighborhoods = ny_encoded_grouped_venues_std

# Columns list
clmns = ["University Spots", "Services", "Restaurants", "Shops", "Cultural Spots", 
         "Sports Venues", "Bars/Clubs", "Touristic Sites", "Parks"]
    
# Cluster the data
kmeans = KMeans(n_clusters=5, random_state=0).fit(ny_clustered_neighborhoods)
labels = kmeans.labels_

# Make the new Cluster column
ny_clustered_neighborhoods['Cluster'] = labels

# Add the column into our list
clmns.extend(['Cluster'])

# Lets analyze the clusters
ny_clustered_neighborhoods[clmns].groupby(['Cluster']).mean()

Unnamed: 0_level_0,University Spots,Services,Restaurants,Shops,Cultural Spots,Sports Venues,Bars/Clubs,Touristic Sites,Parks
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.0,0.165218,0.284665,0.076081,0.036673,0.041317,0.122817,0.0,0.273229
1,0.0,0.173898,0.163351,0.081668,0.04717,0.070389,0.377861,0.0,0.085663
2,0.0,0.139261,0.48494,0.086671,0.026498,0.038884,0.176079,0.0,0.047666
3,0.0,0.183295,0.312262,0.206631,0.054043,0.069833,0.128569,0.0,0.045367
4,0.0,0.079153,0.030782,0.040306,0.026316,0.13254,0.065822,0.0,0.625082


### Clustering neighborhoods in Toronto

In [108]:
# Copy the encoded grouped venues standarized dataframe
to_clustered_neighborhoods = to_encoded_grouped_venues_std

# Columns list
clmns = ["University Spots", "Services", "Restaurants", "Shops", "Cultural Spots", 
         "Sports Venues", "Bars/Clubs", "Touristic Sites", "Parks"]
    
# Cluster the data
kmeans = KMeans(n_clusters=5, random_state=0).fit(to_clustered_neighborhoods)
labels = kmeans.labels_

# Make the new Cluster column
to_clustered_neighborhoods['Cluster'] = labels

# Add the column into our list
clmns.extend(['Cluster'])

# Lets analyze the clusters
to_clustered_neighborhoods[clmns].groupby(['Cluster']).mean()

Unnamed: 0_level_0,University Spots,Services,Restaurants,Shops,Cultural Spots,Sports Venues,Bars/Clubs,Touristic Sites,Parks
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.0,0.162718,0.210629,0.377248,0.021233,0.037733,0.119453,0.0,0.070985
1,0.008782,0.084637,0.386003,0.096959,0.052746,0.062976,0.280308,0.0,0.027589
2,0.0,0.146503,0.591525,0.100051,0.063351,0.045311,0.017524,0.0,0.035735
3,0.008065,0.035484,0.020968,0.0,0.012903,0.066129,0.041935,0.0,0.814516
4,0.0,0.586667,0.0,0.0,0.0,0.176667,0.073333,0.0,0.163333


## Geographical visualization of clustered neighborhoods in New York City

### New map rendering function

In [109]:
def generate_map_of_city_clustered_neighborhoods(city_name, city_neighborhoods, kclusters):
    
    # Find city geographical coordinates using geocode google API
    geolocator = Nominatim(user_agent="my_jupyter_notebook")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    city_location = geolocator.geocode(city_name) #'New York City, NY'
    city_latitude = city_location.latitude
    city_longitude = city_location.longitude
    print('The geographical coordinates of "{}" are {}, {}.'.format(city_name, city_latitude, city_longitude))
    
    # Check number of Boroughs and Neighborhoods in the collected Dataset
    print('The "{}" dataframe has {} clusters and {} neighborhoods.'.format(
          city_name,
          kclusters,
          len(city_neighborhoods['Neighborhood'].unique())))
    
    # set color scheme for the clusters
    x = np.arange(kclusters)
    ys = [i+x+(i*x)**2 for i in range(kclusters)]
    colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]
    
    # create map of city using latitude and longitude values
    map_city = folium.Map(location=[city_latitude, city_longitude], zoom_start=10)

    # add markers to map
    for lat, lng, neighborhood, cluster in zip(city_neighborhoods['Neighborhood Latitude'], 
                                               city_neighborhoods['Neighborhood Longitude'], 
                                               city_neighborhoods['Neighborhood'], 
                                               city_neighborhoods['Cluster']):
        label = folium.Popup(str(neighborhood)+', Cluster: '+str(cluster), parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color=rainbow[cluster-1],
            fill_opacity=0.7).add_to(map_city)  

    return map_city

### Data preparation

In [110]:
# Add the neighborhoods columns back to the clustered dataframe
ny_clustered_neighborhoods['Neighborhood'] = ny_encoded_grouped_venues_std_Neighborhood
ny_clustered_neighborhoods['Neighborhood Latitude'] = ny_encoded_grouped_venues_std_Latitude
ny_clustered_neighborhoods['Neighborhood Longitude'] = ny_encoded_grouped_venues_std_Longitude

# Drop the venues columns from the clustered dataframe
ny_clustered_neighborhoods = ny_clustered_neighborhoods.drop(['University Spots', 
                                                              'Services',
                                                              'Restaurants',
                                                              'Shops',
                                                              'Cultural Spots',
                                                              'Sports Venues',
                                                              'Bars/Clubs',
                                                              'Touristic Sites',
                                                              'Parks'], axis=1)
ny_clustered_neighborhoods.tail()

Unnamed: 0,Cluster,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
301,3,Flatiron,40.739673,-73.990947
302,2,Flatbush,40.636326,-73.958401
303,2,Financial District,40.707107,-74.010665
304,3,Forest Hills,40.725264,-73.844475
305,2,Yorkville,40.77593,-73.947118


### New York City Map

In [111]:
generate_map_of_city_clustered_neighborhoods('New York City, NY', ny_clustered_neighborhoods, 5)

The geographical coordinates of "New York City, NY" are 40.7308619, -73.9871558.
The "New York City, NY" dataframe has 5 clusters and 302 neighborhoods.


## Geographical visualization of clustered neighborhoods in Toronto

### Data preparation

In [112]:
# Add the neighborhoods columns back to the clustered dataframe
to_clustered_neighborhoods['Neighborhood'] = to_encoded_grouped_venues_std_Neighborhood
to_clustered_neighborhoods['Neighborhood Latitude'] = to_encoded_grouped_venues_std_Latitude
to_clustered_neighborhoods['Neighborhood Longitude'] = to_encoded_grouped_venues_std_Longitude

# Drop the venues columns from the clustered dataframe
to_clustered_neighborhoods = to_clustered_neighborhoods.drop(['University Spots', 
                                                              'Services',
                                                              'Restaurants',
                                                              'Shops',
                                                              'Cultural Spots',
                                                              'Sports Venues',
                                                              'Bars/Clubs',
                                                              'Touristic Sites',
                                                              'Parks'], axis=1)
to_clustered_neighborhoods.tail()

Unnamed: 0,Cluster,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
205,0,Humbergate,43.739416,-79.588437
206,4,Humberlea,43.724766,-79.532242
207,3,Humewood-Cedarvale,43.693781,-79.428191
208,0,Fairview,43.778517,-79.346556
209,1,Yorkville,43.67271,-79.405678


### Toronto Map

In [114]:
generate_map_of_city_clustered_neighborhoods('Toronto, ON', to_clustered_neighborhoods, 5)

The geographical coordinates of "Toronto, ON" are 43.653963, -79.387207.
The "Toronto, ON" dataframe has 5 clusters and 208 neighborhoods.


## List Neighborhoods in each cluster

In [115]:
to_clustered_neighborhoods.loc[to_clustered_neighborhoods['Cluster'] == 0, to_clustered_neighborhoods.columns[[1] + list(range(2, to_clustered_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
16,Runnymede,43.673185,-79.487262
22,Silverstone,43.739416,-79.588437
34,Mount Olive,43.739416,-79.588437
39,North Toronto West,43.715383,-79.405678
40,Northwest,43.706748,-79.594054
41,Northwood Park,43.76798,-79.487262
43,Old Burnhamthorpe,43.643515,-79.577201
46,Oriole,43.778517,-79.346556
52,South Steeles,43.739416,-79.588437
68,Willowdale West,43.782736,-79.442259


In [116]:
to_clustered_neighborhoods.loc[to_clustered_neighborhoods['Cluster'] == 1, to_clustered_neighborhoods.columns[[1] + list(range(2, to_clustered_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
0,Adelaide,43.650571,-79.384568
1,Queen's Park,43.662301,-79.389494
3,Rathnelly,43.686412,-79.400049
4,Regent Park,43.65426,-79.360636
5,Richmond,43.650571,-79.384568
7,Riverdale,43.679557,-79.352188
8,Roncesvalles,43.64896,-79.456325
15,Runnymede,43.651571,-79.48445
17,Ryerson,43.657162,-79.378937
29,Mimico South,43.605647,-79.501321


In [117]:
to_clustered_neighborhoods.loc[to_clustered_neighborhoods['Cluster'] == 2, to_clustered_neighborhoods.columns[[1] + list(range(2, to_clustered_neighborhoods.shape[1]))]]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude
6,Richview Gardens,43.688905,-79.554724
12,Rouge,43.806686,-79.194353
14,Royal York South West,43.628841,-79.520999
18,Scarborough Town Centre,43.75741,-79.273304
20,Scarborough Village West,43.716316,-79.239476
24,Silverthorn,43.691116,-79.476013
26,Parkview Hill,43.706397,-79.309937
28,Mimico NW,43.628841,-79.520999
32,Morningside,43.763573,-79.188711
33,Mount Dennis,43.691116,-79.476013
