# Comparing Neighborhoods of New York City and Toronto

In [1]:
import pandas as pd
import numpy as np
import json
import requests
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim

In [2]:
# Toronto
toronto = pd.read_csv("Geospatial_Coordinates.csv")

# New York
newyork = pd.read_excel("new_york.xlsx", index_col = [0])
manhattan = newyork[newyork.Borough == "Manhattan"]

In [3]:
manhattan.shape

(40, 4)

### Foursquare login

In [4]:
CLIENT_ID = 'JIAUONUDO3PFY2S4E2B3EBNWOXUYCERA5SOWLIG2PH1ZTQOU' # your Foursquare ID
CLIENT_SECRET = 'GZF303CAGHNQXBJQP5DUSD3JMRGSBBE5OGMD0RL5QQJNYLY1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [5]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Get venue data

In [6]:
# Merging Toronto and New York dataset
_ = newyork.drop(columns = "Borough")
_["City"] = "Manhattan"
tor = pd.read_excel("toronto.xlsx", index_col = [0])
tor.drop(columns = ["PostalCode", "Borough"], inplace = True)
tor.rename(columns = {"Neighbourhood":"Neighborhood"}, inplace = True)
tor["City"] = "Toronto"
torman = pd.concat([_, tor])

In [8]:
torman

Unnamed: 0,Neighborhood,Latitude,Longitude,City
0,Wakefield,40.894705,-73.847201,Manhattan
1,Co-op City,40.874294,-73.829939,Manhattan
2,Eastchester,40.887556,-73.827806,Manhattan
3,Fieldston,40.895437,-73.905643,Manhattan
4,Riverdale,40.890834,-73.912585,Manhattan
...,...,...,...,...
98,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,Toronto
99,Church and Wellesley,43.665860,-79.383160,Toronto
100,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Toronto
101,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,Toronto


In [9]:
# Get all venues in manhatten
torman_venues = getNearbyVenues(names = torman['Neighborhood'],
                                   latitudes = torman['Latitude'],
                                   longitudes = torman['Longitude']
                                  )

# One hot encoding
torman_onehot = pd.get_dummies(torman_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
torman_onehot['Neighborhood'] = torman_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [torman_onehot.columns[-1]] + list(torman_onehot.columns[:-1])
torman_onehot = torman_onehot[fixed_columns]

# Group by neighborhood
torman_grouped = torman_onehot.groupby(by = "Neighborhood").mean().reset_index()

KeyError: 'groups'

## Clustering

In [None]:
# set number of clusters
k = 10
torman_grouped_ = torman_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters = k, random_state = 0).fit(torman_grouped_)

# add clustering labels
torman_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

torman_merged = torman

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
torman_merged = torman_merged.join(torman_grouped.set_index('Neighborhood'),
                                         on='Neighborhood')

#### I want to find similar neighborhoods to Somerville, Manhattan in Toronto where the number of stores for women is a key indicator:

In [None]:
def find_similar_nbs(neighborhood, city, sort_values_by):
    cluster = torman_merged[torman_merged.loc[:, "Neighborhood"] == neighborhood]["Cluster Labels"].item()
    not_city =  torman_merged[torman_merged.loc[:, "City"] != city]
    similars = not_city.loc[not_city.loc[:, "Cluster Labels"] == cluster].reset_index(drop = True)
    similars_sorted = similars.sort_values(by = "Yoga Studio")
    return similars_sorted, similars_sorted.Neighborhood

In [None]:
map_similar = find_similar_nbs("Somerville", "Manhattan", "Women's Store")[0]
find_similar_nbs("Somerville", "Manhattan", "Women's Store")[1]

In [None]:
# Create map of New York using latitude and longitude values
map_toronto_similar = folium.Map(location=[map_similar.Latitude[0], map_similar.Longitude[0]], zoom_start = 10)

# Add markers to map
for lat, lng, neighborhood in zip(map_similar['Latitude'],
                                   map_similar['Longitude'],
                                   map_similar['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_similar)  
    
map_toronto_similar