# Battle of the Neighbourhoods - New York Data Processing

In [1]:
# Import the required libraries
import json
import os

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import wget
from geopy.geocoders import Nominatim
%matplotlib inline

In [2]:
# Fetch the New York neighbourhood JSON once and reuse the local copy on re-runs.
# Calling wget.download() without a target on every run keeps saving additional
# renamed copies of the same file, so the download is skipped when the file exists.
url = 'https://cocl.us/new_york_dataset/newyork_data.json'
file = 'newyork_data.json'
if os.path.exists(file):
    print('\nUsing previously downloaded file')
else:
    file = wget.download(url, out=file)
    print('\nFile download completed')

100% [............................................................................] 115774 / 115774
File download completed


In [3]:
# Parse the downloaded JSON file into a Python object.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# default text encoding (which is not UTF-8 on every system).
with open(file, encoding='utf-8') as json_data:
    NY_data = json.load(json_data)
print("\nNY file read successful")


NY file read successful


In [4]:
# The 'features' entry holds the list of per-neighbourhood records used below.
NY_Neighbour_data = NY_data['features']

In [5]:
# Display the first record to see how a single neighbourhood entry is laid out.
NY_Neighbour_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [6]:
# Locate the fields of interest inside a record.
# Only the last expression of the cell (the borough) is rendered as output.
NY_Neighbour_data[0]['geometry']['coordinates'] ## [longitude, latitude] pair - longitude comes first
NY_Neighbour_data[0]['properties']['name'] ## Neighbourhood name
NY_Neighbour_data[0]['properties']['borough'] ## Borough name

'Bronx'

In [7]:
# Column layout of the neighbourhood table, followed by an empty frame that carries it.
cols = [
    'Borough',
    'Neighbourhood',
    'Latitude',
    'Longitude',
]
NY_df = pd.DataFrame(data=None, columns=cols)

In [8]:
# Build the neighbourhood table in one pass.
# Rows are collected as plain dicts and the DataFrame is constructed once:
# growing a frame with DataFrame.append inside a loop copies the whole frame on
# every iteration, and DataFrame.append no longer exists in pandas 2.0+.
neighbourhood_records = [
    {
        'Borough': data['properties']['borough'],
        'Neighbourhood': data['properties']['name'],
        # coordinates are stored as [longitude, latitude]
        'Latitude': data['geometry']['coordinates'][1],
        'Longitude': data['geometry']['coordinates'][0],
    }
    for data in NY_Neighbour_data
]
NY_df = pd.DataFrame(neighbourhood_records,
                     columns=['Borough', 'Neighbourhood', 'Latitude', 'Longitude'])
NY_df.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [9]:
# Persist the neighbourhood coordinates table as a UTF-8 CSV, without the index column.
NY_df.to_csv('NY_city_Neighbour_Lat_Lng.csv',encoding='utf-8',index=False)

In [36]:
# Inspect the column dtypes of the neighbourhood table.
NY_df.dtypes

Borough           object
Neighbourhood     object
Latitude         float64
Longitude        float64
dtype: object

In [48]:
# Get the latitude and longitude of New York City so the neighbourhoods can be
# plotted as markers on a map centred on it.
address = 'New York City,NY'
geolocator = Nominatim(user_agent='pu_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError(f"Could not geocode address: {address!r}")
latitude = location.latitude
longitude = location.longitude
print(f"The coordinates of New York City is {latitude},{longitude}")

The coordinates of New York City is 40.7127281,-74.0060152


In [52]:
# Draw one circle marker per neighbourhood on a map centred on New York City.
map_NY = folium.Map(location=(latitude,longitude),zoom_start=10)
for borough,neighbour,lat,lng in zip(NY_df['Borough'],NY_df['Neighbourhood'],NY_df['Latitude'],NY_df['Longitude']):
    # The popup must be handed to the marker through `popup=`; a Popup object
    # that is only created is never shown on the map.
    label = folium.Popup(f"{neighbour},{borough}",parse_html=True)
    folium.CircleMarker(
        [lat,lng],radius=5,popup=label,color='blue',fill=True).add_to(map_NY)

map_NY
    

In [68]:
# Settings for the Foursquare API requests made further down.
# Credentials are read from the environment so that secrets are not stored in
# the notebook: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')
VERSION = '20200309'  # sent as the `v` query parameter
RADIUS = 500          # sent as the `radius` query parameter (presumably metres - confirm)
LIMIT = 100           # sent as the `limit` query parameter

In [74]:
# Find the neighbourhood row whose latitude equals this value.
# This is an exact float comparison, so the literal has to match the stored coordinate exactly.
NY_df[NY_df['Latitude'] == 40.68988687915789]

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
146,Queens,Woodhaven,40.689887,-73.85811


In [80]:
# Leave out the row with index label 146 (the row returned by the latitude lookup).
# NOTE(review): the reason for excluding this row is not recorded here - confirm.
NY_df_new = NY_df.drop(labels=[146], axis=0)

In [86]:
# Renumber the rows 0..n-1 after the removal, discarding the old index labels.
NY_df_new = NY_df_new.reset_index(drop=True)

In [87]:
# Get the venue details for each neighbourhood using the Foursquare "explore" endpoint.
# Every venue is stored as a one-element list wrapping a tuple of
# (neighbourhood, neighbourhood lat, neighbourhood lng, venue name, venue lat, venue lng, category);
# that wrapping is kept because later cells unwrap each entry with [0].
NY_Venue_List=[]
explore_endpoint = 'https://api.foursquare.com/v2/venues/explore'
for neighbour,lat,lng in zip(NY_df_new['Neighbourhood'],NY_df_new['Latitude'],NY_df_new['Longitude']):
    # Let requests build and encode the query string instead of interpolating
    # the credentials into the URL by hand.
    query = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': f'{lat},{lng}',
        'radius': RADIUS,
        'limit': LIMIT,
    }
    # The timeout keeps one stalled request from hanging the whole loop, and
    # raise_for_status() surfaces HTTP errors at the request that caused them.
    response = requests.get(explore_endpoint, params=query, timeout=30)
    response.raise_for_status()
    # A reply without any result group contributes no venues instead of raising.
    groups = response.json().get('response', {}).get('groups') or []
    results = groups[0]['items'] if groups else []
    for venue in results:
        details = venue['venue']
        categories = details['categories']
        NY_Venue_List.append([(neighbour,
                            lat,
                            lng,
                            details['name'],
                            details['location']['lat'],
                            details['location']['lng'],
                            # a venue without any category is kept, with no category name
                            categories[0]['name'] if categories else None)])
print("This completes the Four sqaure api call")

This completes the Four sqaure api call


In [108]:
# Each NY_Venue_List entry is a one-element list wrapping a tuple; [0][0] shows the first venue tuple.
NY_Venue_List[0][0]

('Wakefield',
 40.89470517661,
 -73.84720052054902,
 'Lollipops Gelato',
 40.894123150205274,
 -73.84589162362325,
 'Dessert Shop')

In [89]:
# NOTE(review): every NY_Venue_List entry is a one-element list holding a tuple, so this
# presumably yields a single column of tuples; NY_Venue_df is rebuilt from the unwrapped
# tuples in a later cell.
NY_Venue_df = pd.DataFrame(NY_Venue_List)

In [91]:
# Re-check the first entry: the venue tuple sits inside a one-element list, hence the double [0].
NY_Venue_List[0][0]

('Wakefield',
 40.89470517661,
 -73.84720052054902,
 'Lollipops Gelato',
 40.894123150205274,
 -73.84589162362325,
 'Dessert Shop')

In [140]:
# Unwrap the venue tuple from each one-element list so every item is a flat record.
temp = [wrapped[0] for wrapped in NY_Venue_List]

In [141]:
# Show the first unwrapped venue tuple.
temp[0]

('Wakefield',
 40.89470517661,
 -73.84720052054902,
 'Lollipops Gelato',
 40.894123150205274,
 -73.84589162362325,
 'Dessert Shop')

In [145]:
# One row per venue tuple; columns are numbered 0..6 until they are named.
NY_Venue_df = pd.DataFrame(list(temp))

In [147]:
# Preview the venue table (columns are still numbered at this point).
NY_Venue_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
2,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Wakefield,40.894705,-73.847201,Walgreens,40.896687,-73.84485,Pharmacy
4,Wakefield,40.894705,-73.847201,Dunkin',40.890459,-73.849089,Donut Shop


In [151]:
# Give the positional columns descriptive names: N_* is the neighbourhood, V_* is the venue.
venue_column_names = [
    'Neighbourhood',
    'N_Latitude',
    'N_Longitude',
    'Venue',
    'V_Latitude',
    'V_Longitude',
    'V_Category',
]
NY_Venue_df.columns = venue_column_names

In [152]:
# Preview the venue table again, now with named columns.
NY_Venue_df.head()

Unnamed: 0,Neighbourhood,N_Latitude,N_Longitude,Venue,V_Latitude,V_Longitude,V_Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
2,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Wakefield,40.894705,-73.847201,Walgreens,40.896687,-73.84485,Pharmacy
4,Wakefield,40.894705,-73.847201,Dunkin',40.890459,-73.849089,Donut Shop


In [174]:
# Copy of the venue table without the neighbourhood coordinate columns.
NY_Venue_df_New = NY_Venue_df.drop(['N_Latitude', 'N_Longitude'], axis=1)

In [158]:
##NY_Venue_df.to_csv("NY_Venue_data.csv",columns=(["Neighbourhood","Venue","V_Latitude","V_Longitude","V_Category"]),index=False)

In [192]:
## One-hot encode the venue category, then discard the neighbourhood coordinates.
## NOTE(review): drop_first=True leaves out the indicator column of the first category,
## so that category is not represented in the encoded table - confirm this is intended.
venue_dummies = pd.get_dummies(NY_Venue_df, columns=['V_Category'], drop_first=True)
NY_Venue_df_New = venue_dummies.drop(columns=['N_Latitude', 'N_Longitude'])

In [195]:
# The venue coordinates are not needed for the per-neighbourhood category profile.
NY_Venue_df_New = NY_Venue_df_New.drop(columns=['V_Latitude', 'V_Longitude'])

In [196]:
# Average the one-hot category indicators per neighbourhood, giving the share of
# each category among a neighbourhood's venues.
# 'Venue' holds venue names (text) and cannot be averaged. It is dropped explicitly
# rather than relying on pandas to discard non-numeric columns, which pandas 2.0+
# no longer does (the mean raises instead). errors='ignore' keeps the cell re-runnable.
NY_Venue_df_New = (
    NY_Venue_df_New
    .drop(columns=['Venue'], errors='ignore')
    .groupby('Neighbourhood')
    .mean()
)

In [198]:
# (rows, columns) of the per-neighbourhood table: one row per neighbourhood.
NY_Venue_df_New.shape

(299, 432)

In [199]:
# Save the per-neighbourhood category table; the index (Neighbourhood) is written as the first column.
NY_Venue_df_New.to_csv("NewYork_Venue_data.csv")