# Capstone Project - The Battle of Neighborhoods (Week 2)

### Importing dependencies, handling json data

In [10]:
import pandas as pd
import json
import requests
from pandas.io.json import json_normalize
with open("nyu_2451_34572-geojson.json") as json_data:
    data = json.load(json_data)
data

{'type': 'FeatureCollection',
 'totalFeatures': 306,
 'features': [{'type': 'Feature',
   'id': 'nyu_2451_34572.1',
   'geometry': {'type': 'Point',
    'coordinates': [-73.84720052054902, 40.89470517661]},
   'geometry_name': 'geom',
   'properties': {'name': 'Wakefield',
    'stacked': 1,
    'annoline1': 'Wakefield',
    'annoline2': None,
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.84720052054902,
     40.89470517661,
     -73.84720052054902,
     40.89470517661]}},
  {'type': 'Feature',
   'id': 'nyu_2451_34572.2',
   'geometry': {'type': 'Point',
    'coordinates': [-73.82993910812398, 40.87429419303012]},
   'geometry_name': 'geom',
   'properties': {'name': 'Co-op City',
    'stacked': 2,
    'annoline1': 'Co-op',
    'annoline2': 'City',
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.82993910812398,
     40.87429419303012,
     -73.82993910812398,
     40.87429419303012]}},
  {'type': 'Feature',
 

### For accessing Four Square servies

In [11]:
CLIENT_ID = 'MUNQZIEFBUUAGXO1NYVBPBGJ2AMEKOIT0K0DDL421J3MVIBA'
CLIENT_SECRET = 'LJSVXREDHNZAOLI0PGJITSVOV3QK555243IOAHTKM3XHKJEA' 
VERSION = '20180605' # Foursquare API version

In [12]:
neighborhoods_data = data['features']
neighborhoods_data

[{'type': 'Feature',
  'id': 'nyu_2451_34572.1',
  'geometry': {'type': 'Point',
   'coordinates': [-73.84720052054902, 40.89470517661]},
  'geometry_name': 'geom',
  'properties': {'name': 'Wakefield',
   'stacked': 1,
   'annoline1': 'Wakefield',
   'annoline2': None,
   'annoline3': None,
   'annoangle': 0.0,
   'borough': 'Bronx',
   'bbox': [-73.84720052054902,
    40.89470517661,
    -73.84720052054902,
    40.89470517661]}},
 {'type': 'Feature',
  'id': 'nyu_2451_34572.2',
  'geometry': {'type': 'Point',
   'coordinates': [-73.82993910812398, 40.87429419303012]},
  'geometry_name': 'geom',
  'properties': {'name': 'Co-op City',
   'stacked': 2,
   'annoline1': 'Co-op',
   'annoline2': 'City',
   'annoline3': None,
   'annoangle': 0.0,
   'borough': 'Bronx',
   'bbox': [-73.82993910812398,
    40.87429419303012,
    -73.82993910812398,
    40.87429419303012]}},
 {'type': 'Feature',
  'id': 'nyu_2451_34572.3',
  'geometry': {'type': 'Point',
   'coordinates': [-73.82780644716412, 

In [13]:
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
neighborhoods = pd.DataFrame(columns=column_names)

In [14]:
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


In [15]:
for dat in neighborhoods_data:
    borough = neighborhood_name = dat['properties']['borough'] 
    neighborhood_name = dat['properties']['name']
        
    neighborhood_latlon = dat['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

### Neighborhoods data from json file to Dataframe

In [16]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [17]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


In [18]:
from geopy.geocoders import Nominatim
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


### Map of NYC with neighborhoods using Folium

In [19]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

## Exploring the neighborhoods in our dataframe.


Now, let's get the top 100 venues that are in NYC within a radius of 500 meters.

In [20]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=MUNQZIEFBUUAGXO1NYVBPBGJ2AMEKOIT0K0DDL421J3MVIBA&client_secret=LJSVXREDHNZAOLI0PGJITSVOV3QK555243IOAHTKM3XHKJEA&v=20180605&ll=40.7127281,-74.0060152&radius=500&limit=100'

#### Function to explore all the neighborhoods in NYC


In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
ny_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [24]:
health_categories=['Pharmacy','Yoga Studio','Medical Center']

In [25]:
ny_venues=ny_venues[ny_venues["Venue Category"].isin(health_categories)]
    

## Dataframe with Venue Category - 'Pharmacy','Yoga Studio','Medical Center'


In [26]:
ny_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
2,Wakefield,40.894705,-73.847201,Walgreens,40.896528,-73.844700,Pharmacy
3,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
11,Co-op City,40.874294,-73.829939,Rite Aid,40.870345,-73.828302,Pharmacy
111,Kingsbridge,40.881687,-73.902818,Rite Aid,40.885481,-73.900814,Pharmacy
112,Kingsbridge,40.881687,-73.902818,Walgreens,40.878538,-73.904780,Pharmacy
...,...,...,...,...,...,...,...
9579,Prince's Bay,40.526264,-74.201526,CVS pharmacy,40.525814,-74.201656,Pharmacy
9652,Allerton,40.865788,-73.859319,Rite Aid,40.865949,-73.860922,Pharmacy
9698,Kingsbridge Heights,40.870392,-73.901523,Duane Reade,40.867540,-73.896984,Pharmacy
9715,Erasmus,40.646926,-73.948177,The Yoga Studio,40.650000,-73.950000,Yoga Studio


In [27]:
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ny_onehot['Neighborhood'] = ny_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ny_onehot.columns[-1]] + list(ny_onehot.columns[:-1])
ny_onehot = ny_onehot[fixed_columns]

ny_onehot.head()

Unnamed: 0,Neighborhood,Medical Center,Pharmacy,Yoga Studio
2,Wakefield,0,1,0
3,Wakefield,0,1,0
11,Co-op City,0,1,0
111,Kingsbridge,0,1,0
112,Kingsbridge,0,1,0


In [28]:
ny_onehot.shape

(247, 4)

### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [29]:
ny_grouped = ny_onehot.groupby('Neighborhood').mean().reset_index()
ny_grouped

Unnamed: 0,Neighborhood,Medical Center,Pharmacy,Yoga Studio
0,Allerton,0.0,1.0,0.0
1,Annadale,0.0,1.0,0.0
2,Arden Heights,0.0,1.0,0.0
3,Auburndale,0.0,1.0,0.0
4,Bath Beach,0.0,1.0,0.0
...,...,...,...,...
148,Woodhaven,0.0,1.0,0.0
149,Woodlawn,0.0,1.0,0.0
150,Woodrow,0.0,1.0,0.0
151,Woodside,0.0,1.0,0.0


In [30]:
ny_grouped.shape

(153, 4)

### Function to sort the venues in descending order.


In [31]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [32]:
import numpy as np
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ny_grouped['Neighborhood']

for ind in np.arange(ny_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.shape

(153, 4)

In [33]:
neighborhoods_venues_sorted.head(5)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Allerton,Pharmacy,Yoga Studio,Medical Center
1,Annadale,Pharmacy,Yoga Studio,Medical Center
2,Arden Heights,Pharmacy,Yoga Studio,Medical Center
3,Auburndale,Pharmacy,Yoga Studio,Medical Center
4,Bath Beach,Pharmacy,Yoga Studio,Medical Center


## KMeans Clustering

In [34]:
# set number of clusters
from sklearn.cluster import KMeans
kclusters = 5

ny_grouped_clustering = ny_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ny_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:50] 

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 4, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 0, 0, 0, 2, 0, 1,
       4, 0, 1, 0, 2, 0])

In [35]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


In [36]:
final=neighborhoods[["Neighborhood","Latitude","Longitude"]]

### Create a new dataframe that includes the cluster as well as the top 3venues for each neighborhood.

In [37]:
ny_merged = final

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ny_merged = ny_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
ny_merged.reset_index(drop=True)

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Wakefield,40.894705,-73.847201,0.0,Pharmacy,Yoga Studio,Medical Center
1,Co-op City,40.874294,-73.829939,0.0,Pharmacy,Yoga Studio,Medical Center
2,Eastchester,40.887556,-73.827806,,,,
3,Fieldston,40.895437,-73.905643,,,,
4,Riverdale,40.890834,-73.912585,,,,
...,...,...,...,...,...,...,...
301,Hudson Yards,40.756658,-74.000111,,,,
302,Hammels,40.587338,-73.805530,,,,
303,Bayswater,40.611322,-73.765968,,,,
304,Queensbridge,40.756091,-73.945631,,,,


In [38]:
ny_merged=ny_merged.dropna()
ny_merged["Cluster Labels"]=ny_merged["Cluster Labels"].astype("int")
ny_merged.reset_index(drop=True)

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Wakefield,40.894705,-73.847201,0,Pharmacy,Yoga Studio,Medical Center
1,Co-op City,40.874294,-73.829939,0,Pharmacy,Yoga Studio,Medical Center
2,Kingsbridge,40.881687,-73.902818,0,Pharmacy,Yoga Studio,Medical Center
3,Marble Hill,40.876551,-73.910660,2,Yoga Studio,Pharmacy,Medical Center
4,Woodlawn,40.898273,-73.867315,0,Pharmacy,Yoga Studio,Medical Center
...,...,...,...,...,...,...,...
151,Middle Village,40.716415,-73.881143,0,Pharmacy,Yoga Studio,Medical Center
152,Prince's Bay,40.526264,-74.201526,0,Pharmacy,Yoga Studio,Medical Center
153,Allerton,40.865788,-73.859319,0,Pharmacy,Yoga Studio,Medical Center
154,Kingsbridge Heights,40.870392,-73.901523,0,Pharmacy,Yoga Studio,Medical Center


In [39]:
ny_merged.dtypes

Neighborhood              object
Latitude                 float64
Longitude                float64
Cluster Labels             int32
1st Most Common Venue     object
2nd Most Common Venue     object
3rd Most Common Venue     object
dtype: object

### Visualizing clusters


In [40]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ny_merged['Latitude'], ny_merged['Longitude'], ny_merged['Neighborhood'], ny_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster 1

In [41]:
ny_merged.loc[ny_merged['Cluster Labels'] == 0, ny_merged.columns[[0] + list(range(3, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Wakefield,0,Pharmacy,Yoga Studio,Medical Center
1,Co-op City,0,Pharmacy,Yoga Studio,Medical Center
5,Kingsbridge,0,Pharmacy,Yoga Studio,Medical Center
7,Woodlawn,0,Pharmacy,Yoga Studio,Medical Center
8,Norwood,0,Pharmacy,Yoga Studio,Medical Center
...,...,...,...,...,...
289,Homecrest,0,Pharmacy,Yoga Studio,Medical Center
290,Middle Village,0,Pharmacy,Yoga Studio,Medical Center
291,Prince's Bay,0,Pharmacy,Yoga Studio,Medical Center
298,Allerton,0,Pharmacy,Yoga Studio,Medical Center


### Cluster 2

In [42]:
ny_merged.loc[ny_merged['Cluster Labels'] == 1, ny_merged.columns[[0] + list(range(3, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
49,Greenpoint,1,Yoga Studio,Pharmacy,Medical Center
52,Sheepshead Bay,1,Yoga Studio,Pharmacy,Medical Center
59,Prospect Heights,1,Yoga Studio,Pharmacy,Medical Center
61,Williamsburg,1,Yoga Studio,Pharmacy,Medical Center
65,Cobble Hill,1,Yoga Studio,Pharmacy,Medical Center
68,Gowanus,1,Yoga Studio,Pharmacy,Medical Center
69,Fort Greene,1,Yoga Studio,Pharmacy,Medical Center
70,Park Slope,1,Yoga Studio,Pharmacy,Medical Center
84,Clinton Hill,1,Yoga Studio,Pharmacy,Medical Center
87,Boerum Hill,1,Yoga Studio,Pharmacy,Medical Center


### Cluster 3

In [43]:
ny_merged.loc[ny_merged['Cluster Labels'] == 2, ny_merged.columns[[0] + list(range(3, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
6,Marble Hill,2,Yoga Studio,Pharmacy,Medical Center
86,Downtown,2,Yoga Studio,Pharmacy,Medical Center
102,Inwood,2,Yoga Studio,Pharmacy,Medical Center
112,Lincoln Square,2,Yoga Studio,Pharmacy,Medical Center
114,Midtown,2,Yoga Studio,Pharmacy,Medical Center
119,Lower East Side,2,Yoga Studio,Pharmacy,Medical Center
126,Gramercy,2,Yoga Studio,Pharmacy,Medical Center
135,Forest Hills,2,Yoga Studio,Pharmacy,Medical Center
151,Bayside,2,Yoga Studio,Pharmacy,Medical Center
221,Ditmas Park,2,Yoga Studio,Pharmacy,Medical Center


### Cluster 4

In [44]:
ny_merged.loc[ny_merged['Cluster Labels'] == 3, ny_merged.columns[[0] + list(range(3, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
30,Parkchester,3,Medical Center,Yoga Studio,Pharmacy


### Cluster 5

In [45]:
ny_merged.loc[ny_merged['Cluster Labels'] == 4, ny_merged.columns[[0] + list(range(3, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
64,Brooklyn Heights,4,Yoga Studio,Pharmacy,Medical Center
161,Oakland Gardens,4,Yoga Studio,Pharmacy,Medical Center
276,Flatiron,4,Yoga Studio,Pharmacy,Medical Center
