# Exploratory Data Analysis 

## Import Libraries

In [None]:

import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

try:
    from pandas import json_normalize  # pandas >= 1.0
except ImportError:
    from pandas.io.json import json_normalize  # older pandas releases

print('Libraries imported.')

## 1. Download and Explore Dataset

In [None]:
# Parse the local neighborhood file into a Python object.
with open('newyork_data.json') as source_file:
    newyork_data = json.loads(source_file.read())

In [None]:
neighborhood_data = newyork_data['features']

In [None]:
col_names = ['Borough','Neighborhood','Latitude','Longitude']

neighborhoods = pd.DataFrame(columns=col_names)

In [None]:
# Collect one record per feature and build the frame in a single call.
# DataFrame.append no longer exists in pandas 2.x, and growing a frame row by row
# is quadratic; building from a list also makes this cell safe to re-run.
records = []
for data in neighborhood_data:
    properties = data['properties']
    # The coordinate pair is read as [longitude, latitude].
    neighborhood_latlon = data['geometry']['coordinates']
    records.append({'Borough': properties['borough'],
                    'Neighborhood': properties['name'],
                    'Latitude': neighborhood_latlon[1],
                    'Longitude': neighborhood_latlon[0]})

neighborhoods = pd.DataFrame(records, columns=['Borough', 'Neighborhood', 'Latitude', 'Longitude'])

In [None]:
neighborhoods.head()

There are 5 boroughs; for each one we will first identify how many gyms lie within a 6000-meter radius of its geocoded location.

Using the **Geopy** module we will map all of New York City and the points of New York

In [None]:
# Geocode New York City; latitude/longitude are used as the centre of the overview map.
address = "New York City, NY"
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

print("The geographical coordinates of New York City: {},{}".format(latitude, longitude))

We create the map using folium and then superimpose markers of the different neighborhoods


In [None]:
# Overview map: one blue circle per neighborhood, popup text is "neighborhood,borough".
map_ny = folium.Map(location=[latitude, longitude], zoom_start=11)

marker_rows = neighborhoods[['Latitude', 'Longitude', 'Borough', 'Neighborhood']].itertuples(index=False, name=None)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{},{}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker([lat, lng],
                                 radius=5,
                                 popup=label,
                                 color='blue',
                                 fill=True,
                                 fill_color='#3186cc',
                                 fill_opacity=0.7,
                                 parse_html=False)
    marker.add_to(map_ny)
map_ny

## 2. Do something very similar with gym data

### Define the Credentials for Foursquare

In [None]:
# Credentials are read from the environment so that secrets never live in the notebook
# or its saved output. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError here rather than failing later.
# NOTE(review): the key pair that used to be hard-coded in this cell should be revoked.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # your Foursquare Secret
VERSION = '20180604'
print('Foursquare credentials loaded from the environment.')

### Manhattan Gyms

In [None]:
# Geocode Manhattan; latitude/longitude become the centre of the venue search below.
address = 'Manhattan, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

print("Geo Coordinates for Manhattan: {},{}".format(latitude, longitude))

We then create a request for the data 

In [None]:
LIMIT = 100
radius = 6000
search_query = "Fitness"

# Venue-search request centred on the most recently geocoded coordinates.
# The URL embeds CLIENT_SECRET, so it is deliberately not echoed as cell output.
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)

In [None]:
# Fail fast on network or HTTP errors instead of parsing an error payload as search results.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

Using the **json_normalize** function, we turn the JSON response into a dataframe.

In [None]:
# Flatten the list of venue dicts into one row per venue (the unused `ratings` list was removed).
man_gyms = results['response']['venues']
man_gym = json_normalize(man_gyms)
man_gym.shape

Clean the data by renaming columns and removing unnecessary columns.

In [None]:
# Rename the columns used downstream and keep only those.
# 'location.city' is mapped once: the original dict listed that key twice, so only the
# "Borough" mapping ever took effect.
man_gym = man_gym.rename(columns={"id": "ID", "name": "Name",
                                  "location.lat": "Latitude", "location.lng": "Longitude",
                                  "location.distance": "Distance", "location.city": "Borough"})
man_gym = man_gym[['ID', 'Name', 'Latitude', 'Longitude', 'Distance', "Borough"]]

## Drop the gyms located in Jersey City
# Filtering on the city value replaces the hard-coded row positions (16 and 28), which
# depend on the order of a live API response.
# NOTE(review): assumes those venues carry location.city == "Jersey City" - confirm against the data.
man_gym = man_gym[man_gym["Borough"] != "Jersey City"].reset_index(drop=True)
man_gym

In [None]:
# Plot every Manhattan gym as a blue circle whose popup shows the gym name.
map_man = folium.Map(location=[latitude, longitude], zoom_start=11)

marker_rows = man_gym[['Latitude', 'Longitude', 'Name']].itertuples(index=False, name=None)
for lat, lng, name in marker_rows:
    label = folium.Popup('{}'.format(name), parse_html=True)
    marker = folium.CircleMarker([lat, lng],
                                 radius=5,
                                 popup=label,
                                 color='blue',
                                 fill=True,
                                 fill_color='#3186cc',
                                 fill_opacity=0.7,
                                 parse_html=False)
    marker.add_to(map_man)
map_man

We will remove the gyms located in Jersey City.

In [None]:
# Positional drops are not idempotent: every run removes whichever rows currently sit at
# positions 16 and 28. Filtering on the city value can be re-run safely.
# NOTE(review): assumes the "Borough" column still holds the venue city ("Jersey City") here - confirm.
man_gym = man_gym[man_gym["Borough"] != "Jersey City"].reset_index(drop=True)
man_gym

Retrieve the rating of each gym.

In [None]:
# Look up each venue's rating through the venue-details endpoint.
# Venues whose response carries no rating are recorded as 0 (replaced by the mean later on).
man_ratings_list = []

for venue_id in man_gym["ID"]:
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)
    result = requests.get(url, timeout=30).json()

    try:
        man_ratings_list.append(result['response']['venue']['rating'])
    except (KeyError, TypeError):
        # Only a missing or malformed rating entry counts as "no rating";
        # anything else now surfaces instead of being swallowed by a bare except.
        man_ratings_list.append(0)



In [None]:
# Attach the ratings positionally, then replace the 0 placeholders with the mean of the
# rated gyms. Building a new frame avoids chained-index assignment, which is unreliable
# (SettingWithCopyWarning) and has no effect under pandas copy-on-write.
man_ratings = pd.Series(man_ratings_list, index=man_gym.index, dtype='float64')
man_mean = man_ratings[man_ratings != 0].mean()
man_gym = man_gym.assign(Ratings=man_ratings.mask(man_ratings == 0, man_mean))
man_gym

In [None]:
type(man_gym["Ratings"].values)

## Bronx Gyms

In [None]:
# Geocode the Bronx; latitude/longitude become the centre of the venue search below.
address = 'Bronx, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

print("Geo Coordinates for the Bronx: {},{}".format(latitude, longitude))

In [None]:
LIMIT = 100
radius = 6000
search_query = "Fitness"

# Venue-search request centred on the most recently geocoded coordinates.
# The URL embeds CLIENT_SECRET, so it is deliberately not echoed as cell output.
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)

In [None]:
# Fail fast on network or HTTP errors instead of parsing an error payload as search results.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

In [None]:
# Flatten the venue list into one row per venue. The raw list gets its own name so that
# bronx_gym is always a DataFrame; the unused `ratings` list was removed.
bronx_venues = results['response']['venues']
bronx_gym = json_normalize(bronx_venues)
bronx_gym

In [None]:
# Rename and keep the columns used downstream. The duplicated "location.city" key was
# removed, and .copy() makes the result an independent frame so later column
# assignments do not act on a slice.
bronx_gym = bronx_gym.rename(columns={"id": "ID", "name": "Name",
                                      "location.lat": "Latitude", "location.lng": "Longitude",
                                      "location.distance": "Distance", "location.city": "Borough"})
bronx_gym = bronx_gym[['ID', 'Name', 'Latitude', 'Longitude', 'Distance', "Borough"]].copy()

bronx_gym.head()

In [None]:
# Plot every Bronx gym as a blue circle whose popup shows the gym name.
map_bronx = folium.Map(location=[latitude, longitude], zoom_start=11)

marker_rows = bronx_gym[['Latitude', 'Longitude', 'Name']].itertuples(index=False, name=None)
for lat, lng, name in marker_rows:
    label = folium.Popup('{}'.format(name), parse_html=True)
    marker = folium.CircleMarker([lat, lng],
                                 radius=5,
                                 popup=label,
                                 color='blue',
                                 fill=True,
                                 fill_color='#3186cc',
                                 fill_opacity=0.7,
                                 parse_html=False)
    marker.add_to(map_bronx)
map_bronx

## Ratings For BRONX

In [None]:
# Look up each venue's rating through the venue-details endpoint.
# Venues whose response carries no rating are recorded as 0 (replaced by the mean later on).
bronx_ratings_list = []

for venue_id in bronx_gym["ID"]:
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)
    result = requests.get(url, timeout=30).json()

    try:
        bronx_ratings_list.append(result['response']['venue']['rating'])
    except (KeyError, TypeError):
        # Only a missing or malformed rating entry counts as "no rating".
        bronx_ratings_list.append(0)



In [None]:
# Attach the ratings positionally, then replace the 0 placeholders with the mean of the
# rated gyms, without chained-index assignment.
bronx_ratings = pd.Series(bronx_ratings_list, index=bronx_gym.index, dtype='float64')
bronx_mean = bronx_ratings[bronx_ratings != 0].mean()
bronx_gym = bronx_gym.assign(Ratings=bronx_ratings.mask(bronx_ratings == 0, bronx_mean))
bronx_gym

## Queens Gyms

We repeat the process for the remaining 3 boroughs.

In [None]:
# Geocode a street address in Queens; latitude/longitude become the centre of the venue search below.
address = '14167 Coolidge Ave, Queens, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

print("Geo Coordinates for the Queens: {},{}".format(latitude, longitude))

In [None]:
LIMIT = 100
radius = 6000
search_query = "Fitness"

# Venue-search request centred on the most recently geocoded coordinates.
# The URL embeds CLIENT_SECRET, so it is deliberately not echoed as cell output.
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)

In [None]:
# Fail fast on network or HTTP errors instead of parsing an error payload as search results.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

In [None]:
# Flatten the venue list into one row per venue. The raw list gets its own name so that
# queens_gym is always a DataFrame; the unused `queens_ratings` list was removed.
queens_venues = results['response']['venues']
queens_gym = json_normalize(queens_venues)
queens_gym

In [None]:

# Rename and keep the columns used downstream. The duplicated "location.city" key was
# removed, and .copy() makes the result an independent frame.
queens_gym = queens_gym.rename(columns={"id": "ID", "name": "Name",
                                        "location.lat": "Latitude", "location.lng": "Longitude",
                                        "location.distance": "Distance", "location.city": "Borough"})
queens_gym = queens_gym[['ID', 'Name', 'Latitude', 'Longitude', 'Distance', "Borough"]].copy()

queens_gym.head()


In [None]:
# Plot every Queens gym as a blue circle whose popup shows the gym name.
map_queens = folium.Map(location=[latitude, longitude], zoom_start=11)

marker_rows = queens_gym[['Latitude', 'Longitude', 'Name']].itertuples(index=False, name=None)
for lat, lng, name in marker_rows:
    label = folium.Popup('{}'.format(name), parse_html=True)
    marker = folium.CircleMarker([lat, lng],
                                 radius=5,
                                 popup=label,
                                 color='blue',
                                 fill=True,
                                 fill_color='#3186cc',
                                 fill_opacity=0.7,
                                 parse_html=False)
    marker.add_to(map_queens)
map_queens

## RATINGS FOR QUEENS

In [None]:
# Look up each venue's rating through the venue-details endpoint.
# Venues whose response carries no rating are recorded as 0 (replaced by the mean later on).
queens_ratings_list = []

for venue_id in queens_gym["ID"]:
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)
    result = requests.get(url, timeout=30).json()

    try:
        queens_ratings_list.append(result['response']['venue']['rating'])
    except (KeyError, TypeError):
        # Only a missing or malformed rating entry counts as "no rating".
        queens_ratings_list.append(0)




In [None]:
# Attach the ratings positionally, then replace the 0 placeholders with the mean of the
# rated gyms, without chained-index assignment.
queens_ratings = pd.Series(queens_ratings_list, index=queens_gym.index, dtype='float64')
queens_mean = queens_ratings[queens_ratings != 0].mean()
queens_gym = queens_gym.assign(Ratings=queens_ratings.mask(queens_ratings == 0, queens_mean))
queens_gym

## Brooklyn

In [None]:
# Geocode Brooklyn; latitude/longitude become the centre of the venue search below.
address = "Brooklyn, NY"
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

print("Geo Coordinates for the Brooklyn: {},{}".format(latitude, longitude))

In [None]:
LIMIT = 100
radius = 6000
search_query = "Fitness"

# Venue-search request centred on the most recently geocoded coordinates.
# The URL embeds CLIENT_SECRET, so it is deliberately not echoed as cell output.
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)

In [None]:
# Fail fast on network or HTTP errors instead of parsing an error payload as search results.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

In [None]:
# Flatten the venue list into one row per venue. The raw list gets its own name so that
# brooklyn_gym is always a DataFrame.
brooklyn_venues = results['response']['venues']

brooklyn_gym = json_normalize(brooklyn_venues)
brooklyn_gym

In [None]:

# Rename and keep the columns used downstream. The duplicated "location.city" key was
# removed, and .copy() makes the result an independent frame.
brooklyn_gym = brooklyn_gym.rename(columns={"id": "ID", "name": "Name",
                                            "location.lat": "Latitude", "location.lng": "Longitude",
                                            "location.distance": "Distance", "location.city": "Borough"})
brooklyn_gym = brooklyn_gym[['ID', 'Name', 'Latitude', 'Longitude', 'Distance', "Borough"]].copy()

brooklyn_gym.head()

brooklyn_gym.shape

In [None]:
# Plot every Brooklyn gym as a blue circle whose popup shows the gym name.
map_brooklyn = folium.Map(location=[latitude, longitude], zoom_start=11)

marker_rows = brooklyn_gym[['Latitude', 'Longitude', 'Name']].itertuples(index=False, name=None)
for lat, lng, name in marker_rows:
    label = folium.Popup('{}'.format(name), parse_html=True)
    marker = folium.CircleMarker([lat, lng],
                                 radius=5,
                                 popup=label,
                                 color='blue',
                                 fill=True,
                                 fill_color='#3186cc',
                                 fill_opacity=0.7,
                                 parse_html=False)
    marker.add_to(map_brooklyn)
map_brooklyn

In [None]:
# Look up each venue's rating through the venue-details endpoint.
# Venues whose response carries no rating are recorded as 0 (replaced by the mean later on).
brooklyn_ratings_list = []

for venue_id in brooklyn_gym["ID"]:
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)
    result = requests.get(url, timeout=30).json()

    try:
        brooklyn_ratings_list.append(result['response']['venue']['rating'])
    except (KeyError, TypeError):
        # Only a missing or malformed rating entry counts as "no rating".
        brooklyn_ratings_list.append(0)




In [None]:
# Attach the ratings positionally (a length mismatch raises) instead of index-aligning a one-column DataFrame.
brooklyn_gym = brooklyn_gym.assign(Ratings=pd.Series(brooklyn_ratings_list, index=brooklyn_gym.index, dtype='float64'))


In [None]:
# Replace the 0 placeholders with the mean of the rated gyms. A new column is built
# rather than using chained-index assignment, which has no effect under copy-on-write.
brooklyn_rated = brooklyn_gym["Ratings"] != 0
brook_mean = brooklyn_gym.loc[brooklyn_rated, "Ratings"].mean()

brooklyn_gym = brooklyn_gym.assign(Ratings=brooklyn_gym["Ratings"].where(brooklyn_rated, brook_mean))
brooklyn_gym

## Staten Island

In [None]:
# Geocode Staten Island; latitude/longitude become the centre of the venue search below.
address = "Staten Island, NY"
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude

print("Geo Coordinates for the Staten Island: {},{}".format(latitude, longitude))

In [None]:
LIMIT = 100
radius = 6000
search_query = "Fitness"

# Venue-search request centred on the most recently geocoded coordinates.
# The URL embeds CLIENT_SECRET, so it is deliberately not echoed as cell output.
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)

In [None]:
# Fail fast on network or HTTP errors instead of parsing an error payload as search results.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

In [None]:
# Flatten the venue list into one row per venue. The raw list gets its own name so that
# staten_gym is always a DataFrame.
staten_venues = results['response']['venues']

staten_gym = json_normalize(staten_venues)
staten_gym

In [None]:
# Rename and keep the columns used downstream. The duplicated "location.city" key was
# removed, and .copy() makes the result an independent frame.
staten_gym = staten_gym.rename(columns={"id": "ID", "name": "Name",
                                        "location.lat": "Latitude", "location.lng": "Longitude",
                                        "location.distance": "Distance", "location.city": "Borough"})
staten_gym = staten_gym[['ID', 'Name', 'Latitude', 'Longitude', 'Distance', "Borough"]].copy()

staten_gym.head()

staten_gym.shape

In [None]:
# Plot every Staten Island gym as a blue circle whose popup shows the gym name.
map_staten = folium.Map(location=[latitude, longitude], zoom_start=11)

marker_rows = staten_gym[['Latitude', 'Longitude', 'Name']].itertuples(index=False, name=None)
for lat, lng, name in marker_rows:
    label = folium.Popup('{}'.format(name), parse_html=True)
    marker = folium.CircleMarker([lat, lng],
                                 radius=5,
                                 popup=label,
                                 color='blue',
                                 fill=True,
                                 fill_color='#3186cc',
                                 fill_opacity=0.7,
                                 parse_html=False)
    marker.add_to(map_staten)
map_staten

In [None]:
# Look up each venue's rating through the venue-details endpoint.
# Venues whose response carries no rating are recorded as 0 (replaced by the mean later on).
staten_ratings_list = []

for venue_id in staten_gym["ID"]:
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)
    result = requests.get(url, timeout=30).json()

    try:
        staten_ratings_list.append(result['response']['venue']['rating'])
    except (KeyError, TypeError):
        # Only a missing or malformed rating entry counts as "no rating".
        staten_ratings_list.append(0)



In [None]:
# Attach the ratings positionally (a length mismatch raises) instead of index-aligning a one-column DataFrame.
staten_gym = staten_gym.assign(Ratings=pd.Series(staten_ratings_list, index=staten_gym.index, dtype='float64'))
staten_gym

In [None]:
# Replace the 0 placeholders with the mean of the rated gyms. A new column is built
# rather than using chained-index assignment, which has no effect under copy-on-write.
staten_rated = staten_gym["Ratings"] != 0
nonzero_mean = staten_gym.loc[staten_rated, "Ratings"].mean()

staten_gym = staten_gym.assign(Ratings=staten_gym["Ratings"].where(staten_rated, nonzero_mean))

In [None]:
staten_gym

## Using the New York Technique

In [None]:
#address = "New York City, NY"

#geolocator = Nominatim(user_agent = "ny_explorer")
#location   = geolocator.geocode(address)
#latitude   = location.latitude
#longitude  = location.longitude

#print("The geographical coordinates of New York City: {},{}".format(latitude,longitude))

In [None]:
#LIMIT = 1
#radius = 33000
#search_query = "Fitness"

##url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
#url

In [None]:
#results =requests.get(url).json()
#results

In [None]:
#ny_gym = results['response']['venues']

#ny_gym  =json_normalize(ny_gym)
#ny_gym

In [None]:
#ny_gym.rename(columns={"id":"ID","location.lat": "Latitude", "location.lng": "Longitude","location.city":"Borough","name":"Name","location.city":"Borough","location.distance":"Distance"},inplace=True)
#ny_gym = ny_gym[['ID','Name','Latitude','Longitude','Distance',"Borough"]]

##ny_gym.head()

#ny_gym.shape

## Realized we could only get 50 Results Per Query

##  First method works 


Let's replace the NaN values here with the average of each table and then get the ratings.

We concatenate the gyms.

In [None]:
# Overwrite the Borough column of each per-borough frame with the borough's own name.
borough_frames = (
    (queens_gym, "Queens"),
    (bronx_gym, "Bronx"),
    (man_gym, "Manhattan"),
    (brooklyn_gym, "Brooklyn"),
    (staten_gym, "Staten Island"),
)
for borough_frame, borough_name in borough_frames:
    borough_frame["Borough"] = borough_name

In [None]:
ny_gyms = pd.concat([bronx_gym,man_gym,staten_gym,brooklyn_gym,queens_gym],axis = 0)

In [None]:
ny_gyms = ny_gyms[["Borough","Distance", "ID","Latitude","Longitude","Name","Ratings"]]

In [None]:
ny_gyms.groupby("Borough").count()


We then make a map of all of the gyms in New York city


In [None]:
# Centre the city-wide map on the mean gym position. At this point latitude/longitude
# still hold the coordinates from the last borough geocode (Staten Island), so using
# them put the map off-centre.
map_ny_gyms = folium.Map(location=[ny_gyms['Latitude'].mean(), ny_gyms['Longitude'].mean()], zoom_start=11)

for lat, lng, name in zip(ny_gyms['Latitude'], ny_gyms['Longitude'], ny_gyms['Name']):
    label = '{}'.format(name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng],
                        radius=5,
                        popup=label,
                        color='blue',
                        fill=True,
                        fill_color='#3186cc',
                        fill_opacity=0.7,
                        parse_html=False).add_to(map_ny_gyms)
map_ny_gyms

In [None]:
# The frames were concatenated without ignore_index, so index labels repeat. Replace them
# with a clean RangeIndex and discard the old labels instead of storing them in an extra
# 'index' column (which also made re-running this cell add yet another column).
ny_gyms = ny_gyms.reset_index(drop=True)

In [None]:
ny_gyms

In [None]:
ny_gyms_ratings = ny_gyms[['Borough','Ratings']]

In [None]:
ny_gyms_ratings = ny_gyms_ratings.groupby('Borough').mean().reset_index()


In [None]:
ny_gyms_ratings

## We will convert the Latitude and Longitude to x and y coordinates in order to use K-Means Clusterization


We define a function to convert the Latitude and Longitude to x and y coordinates


In [None]:
import shapely.geometry

#!pip install pyproj
import pyproj

import math

def lonlat_to_xy(lon, lat, zone=18):
    """Project WGS84 longitude/latitude (degrees) to UTM easting/northing (metres).

    Parameters
    ----------
    lon, lat : float or array-like
        Longitude and latitude, in that order.
    zone : int, default 18
        Northern-hemisphere UTM zone. New York City (about 74 degrees W) lies in
        zone 18; the previous hard-coded zone 33 covers central Europe and badly
        distorts distances for this data.

    Returns
    -------
    tuple
        (x, y) in the chosen UTM zone.
    """
    # EPSG:326xx is "WGS 84 / UTM zone xxN"; always_xy=True fixes the axis order to (lon, lat).
    to_utm = pyproj.Transformer.from_crs("EPSG:4326", "EPSG:{}".format(32600 + zone), always_xy=True)
    x, y = to_utm.transform(lon, lat)
    return x, y

def xy_to_lonlat(x, y, zone=18):
    """Inverse of lonlat_to_xy: UTM easting/northing (metres) back to WGS84 degrees.

    Parameters
    ----------
    x, y : float or array-like
        Easting and northing in the given UTM zone.
    zone : int, default 18
        Northern-hemisphere UTM zone; must match the zone used for the forward projection.

    Returns
    -------
    tuple
        (lon, lat) in degrees.
    """
    from_utm = pyproj.Transformer.from_crs("EPSG:{}".format(32600 + zone), "EPSG:4326", always_xy=True)
    lon, lat = from_utm.transform(x, y)
    return lon, lat
    
print("Libraries Imported Functioons Created ")

In [None]:
ny_dist = ny_gyms[["Name","Borough","Latitude","Longitude"]]

In [None]:
# Project every gym to planar x/y coordinates for clustering.
Xval = []
Yval = []

for lat, lon in zip(ny_dist["Latitude"], ny_dist["Longitude"]):
    # lonlat_to_xy takes (lon, lat); the original call passed (lat, lon), swapping the axes.
    X, Y = lonlat_to_xy(lon, lat)

    Xval.append(X)
    Yval.append(Y)




In [None]:
# ny_dist is a column selection of ny_gyms, so new columns are added with assign() on a
# fresh frame. The lists are attached positionally; a length mismatch raises instead of
# silently producing NaN through index alignment.
ny_dist = ny_dist.assign(X=Xval, Y=Yval)

In [None]:
ny_dist

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Quick look at the projected gym coordinates before clustering.
plt.scatter(Xval, Yval)
plt.show()  # the original referenced plt.show without calling it

In [None]:
xy = pd.DataFrame({'X':Xval, 'Y': Yval})

xy.iloc[:,0]

In [None]:
# A fixed random_state makes the cluster labels reproducible across kernel restarts.
kmeans = KMeans(n_clusters = 12, n_init = 20, random_state = 42)
kmeans.fit(xy)

In [None]:
labels = kmeans.predict(xy)
centroids = kmeans.cluster_centers_
centroids

In [None]:
centroids[:,0]
centroids[:,1]

In [None]:
plt.scatter(xy['X'],xy['Y'])
plt.scatter(centroids[:,0],centroids[:,1],color = "r")


In [None]:
labels_clusters = kmeans.labels_

In [None]:
len(labels_clusters)

In [None]:
ny_gyms["Label"] = labels_clusters

In [None]:
ny_gyms

## We plot the labels of each gym as well as the overlap of the heatmap of each borough.


In [None]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [None]:
# Take the cluster count from the fitted model; it was hard-coded to 15 here while the
# model is fitted with 12 clusters.
kclusters = kmeans.n_clusters
# Centre on the mean gym position: latitude/longitude still hold the last borough geocode.
map_clusters = folium.Map(location=[ny_gyms['Latitude'].mean(), ny_gyms['Longitude'].mean()], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(ny_gyms['Latitude'], ny_gyms['Longitude'], ny_gyms['Borough'], ny_gyms['Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so index directly; the original `cluster-1`
    # wrapped label 0 around to the last colour.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

As we can see the 

In [None]:
ny_gyms

In [None]:
ny_gyms.groupby('Label').count()

# We will save the ny_gyms dataframe to a CSV file to use in our report

In [None]:
!wget -q -O http://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/nybb/FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=geojson

In [None]:
gson = r'newyork_data.json'

In [None]:
ny_gyms_ratings


In [None]:
# Geocode New York City again so the ratings map is centred on the whole city.
address = "New York City, NY"

geolocator = Nominatim(user_agent = "ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# The message previously said "Staten Island" (copy-paste slip) although the address is New York City.
print("Geo Coordinates for New York City: {},{}".format(latitude,longitude))

In [None]:

# Base map for the ratings choropleth (the stray, unused `rating_map` was removed).
ratings_map = folium.Map(location = [latitude, longitude],zoom_start =11)
ratings_map

In [None]:
gson = r'borough_data.json'

In [None]:
# Shade each borough by its mean gym rating. folium.Choropleth replaces the deprecated
# Map.choropleth method and takes the same arguments.
folium.Choropleth(
    geo_data=gson,
    data=ny_gyms_ratings,
    columns=['Borough','Ratings'],
    key_on='feature.properties.boro_name',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='NY'
).add_to(ratings_map)

# display map
ratings_map

In [None]:
ny_gyms_ratings

## You can see that the best-rated gyms are in Manhattan, while Queens and Brooklyn are almost identical.

In [None]:
# `df` is never defined in this notebook; the frame to export is ny_gyms.
ny_gyms.to_csv(r'gym_data.csv', index=False)