In [6]:
import numpy as np
import pandas as pd
import json
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
print('Everything has been imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          90 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.49   | 32 KB     | ##

In [7]:
import folium
CLIENT_ID = ''
SECRET = ''
VERSION = '20190830'

We will first import the needed files for analysis.

In [8]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset

In [9]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [10]:
neighborhoods_data = newyork_data['features']

In [11]:
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
neighborhoods = pd.DataFrame(columns=column_names)

In [12]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [13]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [14]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent='ny_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [15]:
map_ny = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ny)

map_ny

Above we can see the localization of the neighborhoods in New York. It is helpful because based on this map and other data we will choose the best place to build a hotel.

We will now load the other datasets that are relevant for our cause: NYPD Complaints Dataset, New York Hotels Dataset and the Airbnb Dataset. We will analyze these datasets to be able to provide suggestions.

In [16]:
nypd = pd.read_csv('NYPD complaint.csv')
nypd.head()

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,...,ADDR_PCT_CD,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,PARKS_NM,HADEVELOPT,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon
0,101109527,12/31/2015,23:45:00,,,12/31/2015,113,FORGERY,729.0,"FORGERY,ETC.,UNCLASSIFIED-FELO",...,44,INSIDE,BAR/NIGHT CLUB,,,1007314.0,241257.0,40.828848,-73.916661,"(40.828848333, -73.916661142)"
1,153401121,12/31/2015,23:36:00,,,12/31/2015,101,MURDER & NON-NEGL. MANSLAUGHTER,,,...,103,OUTSIDE,,,,1043991.0,193406.0,40.697338,-73.784557,"(40.697338138, -73.784556739)"
2,569369778,12/31/2015,23:30:00,,,12/31/2015,117,DANGEROUS DRUGS,503.0,"CONTROLLED SUBSTANCE,INTENT TO",...,28,,OTHER,,,999463.0,231690.0,40.802607,-73.945052,"(40.802606608, -73.945051911)"
3,968417082,12/31/2015,23:30:00,,,12/31/2015,344,ASSAULT 3 & RELATED OFFENSES,101.0,ASSAULT 3,...,105,INSIDE,RESIDENCE-HOUSE,,,1060183.0,177862.0,40.654549,-73.726339,"(40.654549444, -73.726338791)"
4,641637920,12/31/2015,23:25:00,12/31/2015,23:30:00,12/31/2015,344,ASSAULT 3 & RELATED OFFENSES,101.0,ASSAULT 3,...,13,FRONT OF,OTHER,,,987606.0,208148.0,40.738002,-73.987891,"(40.7380024, -73.98789129)"


In [17]:
short_nypd = nypd[['OFNS_DESC', 'Latitude', 'Longitude']]
short_nypd

Unnamed: 0,OFNS_DESC,Latitude,Longitude
0,FORGERY,40.828848,-73.916661
1,MURDER & NON-NEGL. MANSLAUGHTER,40.697338,-73.784557
2,DANGEROUS DRUGS,40.802607,-73.945052
3,ASSAULT 3 & RELATED OFFENSES,40.654549,-73.726339
4,ASSAULT 3 & RELATED OFFENSES,40.738002,-73.987891
...,...,...,...
1010,OFF. AGNST PUB ORD SENSBLTY &,40.808374,-73.926996
1011,FRAUDS,40.863773,-73.922196
1012,OFF. AGNST PUB ORD SENSBLTY &,40.701986,-73.927964
1013,OFF. AGNST PUB ORD SENSBLTY &,40.718564,-73.988201


In [19]:
short_nypd.rename({'OFNS_DESC': 'Description'}, axis=1, inplace=True)
short_nypd

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Unnamed: 0,Description,Latitude,Longitude
0,FORGERY,40.828848,-73.916661
1,MURDER & NON-NEGL. MANSLAUGHTER,40.697338,-73.784557
2,DANGEROUS DRUGS,40.802607,-73.945052
3,ASSAULT 3 & RELATED OFFENSES,40.654549,-73.726339
4,ASSAULT 3 & RELATED OFFENSES,40.738002,-73.987891
...,...,...,...
1010,OFF. AGNST PUB ORD SENSBLTY &,40.808374,-73.926996
1011,FRAUDS,40.863773,-73.922196
1012,OFF. AGNST PUB ORD SENSBLTY &,40.701986,-73.927964
1013,OFF. AGNST PUB ORD SENSBLTY &,40.718564,-73.988201


In [20]:
short_nypd.dropna(axis=0, inplace=True)
short_nypd

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Description,Latitude,Longitude
0,FORGERY,40.828848,-73.916661
1,MURDER & NON-NEGL. MANSLAUGHTER,40.697338,-73.784557
2,DANGEROUS DRUGS,40.802607,-73.945052
3,ASSAULT 3 & RELATED OFFENSES,40.654549,-73.726339
4,ASSAULT 3 & RELATED OFFENSES,40.738002,-73.987891
...,...,...,...
1010,OFF. AGNST PUB ORD SENSBLTY &,40.808374,-73.926996
1011,FRAUDS,40.863773,-73.922196
1012,OFF. AGNST PUB ORD SENSBLTY &,40.701986,-73.927964
1013,OFF. AGNST PUB ORD SENSBLTY &,40.718564,-73.988201


Here we have a dataframe consisting of 999 different crimes committed in New York.

In [21]:
map_crimes = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng in zip(short_nypd['Latitude'], short_nypd['Longitude']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_crimes)  
    
map_crimes

Based on the map above, which shows 999 crimes, we can clearly see that the smallest amount of them have been committed in State Island. This might be a good place to build a hotel based on Crime Data.

We will now load the Hotels Dataset to see where are they placed in New York.

In [22]:
hotels = pd.read_csv('new_york_hotels.csv', encoding = 'unicode_escape')
hotels

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.0100,134.0000
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.72410,-73.79822,3.0,134.1700,84.1600
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.3900,89.3900
...,...,...,...,...,...,...,...,...,...,...,...
1626,324259,Residence Inn Yonkers Westchester County,7 Executive Blvd,Yonkers,NY,10701,40.97275,-73.88075,3.0,269.3600,169.3600
1627,107949,Ramada Inn Yonkers,125 Tuckahoe Rd,Yonkers,NY,10710,40.95466,-73.86483,2.5,129.0000,119.0000
1628,509723,Hyatt Place New York/Yonkers,7000 Mall Walk,Yonkers,NY,10704,40.92625,-73.85438,3.0,249.3100,199.3100
1629,621870,Hampton Inn & Suites Yonkers - Westchester,555 Tuckahoe Rd,Yonkers,NY,10710,40.95375,-73.84935,2.5,189.1900,134.1800


In [23]:
short_hotels = hotels[['city', 'latitude', 'longitude']]
short_hotels.head()

Unnamed: 0,city,latitude,longitude
0,Albany,42.68751,-73.81643
1,Albany,42.68971,-73.82021
2,Albany,42.7241,-73.79822
3,Albany,42.65157,-73.77638
4,Albany,42.68873,-73.81854


In [24]:
short_hotels1 = short_hotels[short_hotels.city == 'New York']
short_hotels2 = short_hotels[short_hotels.city == 'Staten Island']

In [29]:
short_hotels3 = short_hotels1.append(short_hotels2)
short_hotels3

Unnamed: 0,city,latitude,longitude
787,New York,40.74624,-73.97929
788,New York,40.76452,-73.98078
789,New York,40.76366,-73.97965
790,New York,40.74929,-73.97670
791,New York,40.72102,-74.00418
...,...,...,...
1479,Staten Island,40.61286,-74.17925
1480,Staten Island,40.64200,-74.07902
1481,Staten Island,40.61548,-74.06338
1482,Staten Island,40.63676,-74.12343


In [45]:
map_hotels = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng in zip(short_hotels3['latitude'], short_hotels3['longitude']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_hotels)  
    
map_hotels

Above is a map of the hotels in the New York area. Again, Staten Island looks good when it comes to building a hotel, but it's not the only place that's promising: Manhattan, especially along Central Park seems to be a good spot too.

However, hotels are not the only places tourists can rent a place. Nowadays a multitude of people choose other methods of finding a place to rent. Such an example would be Airbnb.

In the following lines we will explore the location of 1000 Airbnb associates that have decided to rent their properties in New York City. Please note that the entire dataset contains 50000+ entries of properties, but due to the size of the actual dataset I have decided to load a set of entries that would give us a good idea of the general property localization.

In [32]:
air = pd.read_csv('Airbnb.csv')
air.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,10/19/2018,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,5/21/2019,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,7/5/2019,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,11/19/2018,0.1,1,0


In [34]:
short_air = air[['neighbourhood', 'latitude', 'longitude', 'availability_365']]
short_air.head(10)

Unnamed: 0,neighbourhood,latitude,longitude,availability_365
0,Kensington,40.64749,-73.97237,365
1,Midtown,40.75362,-73.98377,355
2,Harlem,40.80902,-73.9419,365
3,Clinton Hill,40.68514,-73.95976,194
4,East Harlem,40.79851,-73.94399,0
5,Murray Hill,40.74767,-73.975,129
6,Bedford-Stuyvesant,40.68688,-73.95596,0
7,Hell's Kitchen,40.76489,-73.98493,220
8,Upper West Side,40.80178,-73.96723,0
9,Chinatown,40.71344,-73.99037,188


We will drop the rows that correspond to 0 availability.

In [40]:
short_air1 = short_air[short_air.availability_365 != 0]
short_air1.head(10)

Unnamed: 0,neighbourhood,latitude,longitude,availability_365
0,Kensington,40.64749,-73.97237,365
1,Midtown,40.75362,-73.98377,355
2,Harlem,40.80902,-73.9419,365
3,Clinton Hill,40.68514,-73.95976,194
5,Murray Hill,40.74767,-73.975,129
7,Hell's Kitchen,40.76489,-73.98493,220
9,Chinatown,40.71344,-73.99037,188
10,Upper West Side,40.80316,-73.96545,6
11,Hell's Kitchen,40.76076,-73.98867,39
12,South Slope,40.66829,-73.98779,314


In [43]:
short_air1['availability_365'].mean()

212.03703703703704

The properties listed on Airbnb are available, on average, 212 day a year.

Now let's see the location of these properties in New York.

In [44]:
map_air = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng in zip(short_air1['latitude'], short_air1['longitude']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_air)  
    
map_air

Staten Island remains a point of interest for our customer, but along the Central Park we can see a breathing space, especially in the south. This might be a good place for a hotel. Also the southern part of New York seems to have few Airbnb listed properties, even though it's far from downtown.

Now that we have established a couple of focal points through the interactive maps, we can load data about the venues around these spots. We will be interested in Central Park and Staten Island.

In [51]:
staten_island = neighborhoods[neighborhoods.Borough == 'Staten Island']
staten_island

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
197,Staten Island,St. George,40.644982,-74.079353
198,Staten Island,New Brighton,40.640615,-74.087017
199,Staten Island,Stapleton,40.626928,-74.077902
200,Staten Island,Rosebank,40.615305,-74.069805
201,Staten Island,West Brighton,40.631879,-74.107182
...,...,...,...,...
287,Staten Island,Egbertville,40.579119,-74.127272
291,Staten Island,Prince's Bay,40.526264,-74.201526
292,Staten Island,Lighthouse Hill,40.576506,-74.137927
293,Staten Island,Richmond Valley,40.519541,-74.229571


There are 63 neighborhoods in Staten Island, but we are only interested in the ones located in the North, because it's closer to downtown New York. New Brighton for example could be a good place.

In [52]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    SECRET, 
    VERSION, 
    40.640615, 
    -74.087017, 
    radius, 
    LIMIT)

In [70]:
results = requests.get(url).json()

In [54]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [56]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues)

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Family Dollar,Discount Store,40.63871,-74.087767
1,MTA Bus - Jersey St & Hendricks Av (S42/S52),Bus Stop,40.640167,-74.087157
2,MTA Bus - Jersey St & Scribner Av (S52),Bus Stop,40.64017,-74.087182
3,Hoebowl Bowling Center,Bowling Alley,40.640901,-74.085095
4,MTA Bus - Jersey St & Hendricks Av (S52/S42),Bus Stop,40.640028,-74.087138
5,MTA Bus - Jersey St & Layton Av (S42/S52),Bus Stop,40.641467,-74.08669
6,Lopez Grocery,Deli / Bodega,40.64008,-74.087286
7,Bocce Courts,Park,40.6398,-74.09
8,Skyline Playground,Playground,40.63919,-74.089867
9,AK Food Deli,Deli / Bodega,40.63798,-74.08698


In [77]:
map_venues_staten = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng in zip(nearby_venues['lat'], nearby_venues['lng']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_venues_staten)  
    
map_venues_staten

We can see that in the area there are a number of venues: parks, bus stops and stores. By the number of the venues we can see that this is not a very populated area. Also, we need to mention the fact that the hotel would be quite far from downtown, which can drastically decrease the price.

Now let's try the same thing with Central Park and we'll make a comparison.

In [61]:
central = neighborhoods[neighborhoods.Borough == 'Manhattan']
central.head(20)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
6,Manhattan,Marble Hill,40.876551,-73.91066
100,Manhattan,Chinatown,40.715618,-73.994279
101,Manhattan,Washington Heights,40.851903,-73.9369
102,Manhattan,Inwood,40.867684,-73.92121
103,Manhattan,Hamilton Heights,40.823604,-73.949688
104,Manhattan,Manhattanville,40.816934,-73.957385
105,Manhattan,Central Harlem,40.815976,-73.943211
106,Manhattan,East Harlem,40.792249,-73.944182
107,Manhattan,Upper East Side,40.775639,-73.960508
108,Manhattan,Yorkville,40.77593,-73.947118


We are interested in Midtown, since it's a highly populated zone and an interest point.

In [64]:
LIMIT = 50
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    SECRET, 
    VERSION, 
    40.754691,
    -73.981669, 
    radius, 
    LIMIT)

In [69]:
results1 = requests.get(url).json()

In [68]:
venues1 = results1['response']['groups'][0]['items']
    
nearby_venues1 = json_normalize(venues1)

filtered_columns1 = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues1 = nearby_venues1.loc[:, filtered_columns1]

nearby_venues1['venue.categories'] = nearby_venues1.apply(get_category_type, axis=1)

nearby_venues1.columns = [col.split(".")[-1] for col in nearby_venues1.columns]

nearby_venues1.head(20)

Unnamed: 0,name,categories,lat,lng
0,Bryant Park,Park,40.753621,-73.983265
1,New York Public Library Terrace,Plaza,40.753017,-73.98148
2,Nat Sherman Townhouse,Smoke Shop,40.753283,-73.980358
3,sweetgreen,Salad Place,40.75464,-73.983102
4,Joanna Vargas Skin Care,Spa,40.753136,-73.980721
5,Equinox East 43rd Street,Gym,40.754089,-73.9799
6,Sofitel New York,Hotel,40.755787,-73.981762
7,Xi'an Famous Foods,Chinese Restaurant,40.755926,-73.980751
8,COS,Clothing Store,40.753678,-73.980686
9,NBA Store,Sporting Goods Shop,40.755305,-73.979377


In [79]:
map_venues_man = folium.Map(location=[latitude, longitude], zoom_start=12)

for lat, lng in zip(nearby_venues1['lat'], nearby_venues1['lng']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_venues_man)  
    
map_venues_man

We can clearly see that this is a very populated area based on the number of venues and their diversity. A hotel here would definitely be successful but the area will increase the price of building and maintaining.