In [65]:
import pandas as pd
import numpy as np
import geopandas as gpd

In [66]:
# Load the raw bus stop shelter dataset from the local datasets folder.
df = pd.read_csv('./datasets/Bus_Stop_Shelter.csv')

In [67]:
# Check the (rows, columns) dimensions of the freshly loaded frame.
df.shape

(3330, 18)

In [68]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,the_geom,BoroCode,BoroName,BoroCD,CounDist,AssemDist,StSenDist,CongDist,Shelter_ID,Corner,On_Street,Cross_Stre,Longitude,Latitude,NTAName,FEMAFldz,FEMAFldT,HrcEvac
0,POINT (-73.94874357365637 40.64085475597672),3,Brooklyn,317,45,42,21,9,BR02218,SW,AV D,NOSTRAND AV,-73.948745,40.640863,East Flatbush-Erasmus,X,AREA OF MINIMAL FLOOD HAZARD,5.0
1,POINT (-74.02283555116742 40.6292447605423),3,Brooklyn,310,43,46,26,11,BR02219,SW,5 AV,BAY RIDGE PKWY,-74.022837,40.629253,Bay Ridge,X,AREA OF MINIMAL FLOOD HAZARD,
2,POINT (-73.92467258003798 40.67375874807616),3,Brooklyn,308,41,55,20,9,BR02220,SE,ST MARKS AV,BUFFALO AV,-73.924674,40.673767,Crown Heights (North),X,AREA OF MINIMAL FLOOD HAZARD,
3,POINT (-73.9401345806071 40.578033768126936),3,Brooklyn,315,48,45,23,8,BR02221,SW,ORIENTAL BLVD,MACKENZIE ST,-73.940136,40.578042,Sheepshead Bay-Manhattan Beach-Gerritsen Beach,X,AREA OF MINIMAL FLOOD HAZARD,1.0
4,POINT (-73.93991658060635 40.57805676808165),3,Brooklyn,315,48,45,23,8,BR02222,SW,ORIENTAL BLVD,MACKENZIE ST,-73.939918,40.578065,Sheepshead Bay-Manhattan Beach-Gerritsen Beach,X,AREA OF MINIMAL FLOOD HAZARD,1.0


In [69]:
# Keep only the neighbourhood name, borough name and coordinates;
# every other column is discarded.
df = df.loc[:, ['NTAName', 'BoroName', 'Longitude', 'Latitude']]

In [70]:
# List the distinct borough names so the exact spelling used by the
# borough filter can be checked against the data.
df['BoroName'].unique()

array(['Brooklyn', 'Queens', 'Bronx', 'Manhattan', 'Staten Island'],
      dtype=object)

In [71]:
# Drop all non-Manhattan rows. The explicit .copy() makes the result an
# independent frame, so adding columns to it later (e.g. zone_id) does not
# raise SettingWithCopyWarning for writing into a slice of the full dataset.
df = df[df['BoroName'] == 'Manhattan'].copy()

In [72]:
# Row/column count after restricting the frame to Manhattan.
df.shape

(720, 4)

In [73]:
# Preview the filtered frame.
df.head()

Unnamed: 0,NTAName,BoroName,Longitude,Latitude
898,East Harlem (South),Manhattan,-73.94616,40.789307
899,East Harlem (South),Manhattan,-73.944332,40.79182
900,Harlem (South),Manhattan,-73.952176,40.803386
901,Harlem (South),Manhattan,-73.952078,40.802923
902,Harlem (South),Manhattan,-73.955667,40.80419


In [74]:
# Helpers for assigning taxi zones to locations.
# Read a GeoJSON file from disk with geopandas.
def load_geojson_gpd(filepath):
    """Read the file at ``filepath`` with ``gpd.read_file`` and return the result."""
    geo_frame = gpd.read_file(filepath)
    return geo_frame

# Function to find zones using spatial join in geopandas
def assign_zones(df, gdf):
    """Return the ``location_id`` of the zone polygon containing each point of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Longitude`` and ``Latitude`` columns. The values are
        treated as WGS84 degrees (EPSG:4326) -- assumption based on the column
        names; confirm for other inputs. ``df`` itself is not modified.
    gdf : geopandas.GeoDataFrame
        Zone polygons with a ``location_id`` column and a defined CRS.

    Returns
    -------
    pandas.Series
        One ``location_id`` per row of ``df``, carrying ``df``'s index. Rows
        whose point lies within no polygon get NaN.

    Raises
    ------
    ValueError
        If ``gdf`` has no CRS, since the points cannot be aligned with it.
    """
    if gdf.crs is None:
        raise ValueError("gdf has no CRS; cannot align points with the zone polygons")

    # Build a geometry-only frame: this leaves the caller's frame untouched and
    # keeps unrelated columns out of the join, so 'location_id' can never be
    # renamed with a suffix because of a column-name clash.
    gdf_points = gpd.GeoDataFrame(
        geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']),
        index=df.index,
        crs='EPSG:4326',
    )

    # Reproject the points instead of merely relabelling them, so the join is
    # still correct when the polygons are stored in a different CRS.
    if gdf_points.crs != gdf.crs:
        gdf_points = gdf_points.to_crs(gdf.crs)

    # Spatial join points to polygons
    zone_polygons = gdf[['location_id', gdf.geometry.name]]
    joined = gpd.sjoin(gdf_points, zone_polygons, how="left", predicate='within')
    zone_ids = joined['location_id']

    # A point inside several (overlapping) polygons yields several rows with
    # the same index label; keep the first match so the result aligns 1:1
    # with df and can be assigned back as a column.
    return zone_ids[~zone_ids.index.duplicated(keep='first')]



In [75]:
# Read the taxi zone polygons from the GeoJSON file
geo_df = load_geojson_gpd('./datasets/NYC Taxi Zones.geojson')

# Look up the zone for every stop and store it in a new zone_id column
stop_zone_ids = assign_zones(df, geo_df)
df['zone_id'] = stop_zone_ids

df.head()

Unnamed: 0,NTAName,BoroName,Longitude,Latitude,zone_id
898,East Harlem (South),Manhattan,-73.94616,40.789307,75
899,East Harlem (South),Manhattan,-73.944332,40.79182,75
900,Harlem (South),Manhattan,-73.952176,40.803386,41
901,Harlem (South),Manhattan,-73.952078,40.802923,41
902,Harlem (South),Manhattan,-73.955667,40.80419,41


In [76]:
# Distinct zone ids found on the stops, converted to strings
all_zones = list(map(str, df['zone_id'].unique()))

In [77]:
# Turn the list into a set so it can take part in set operations
# (the set difference against our_zones).
all_zones = set(all_zones)


In [78]:
# Load the zones table; its zone_id column supplies the reference set of zones.
zones_df = pd.read_csv('./datasets/zones_df.csv')

In [79]:
# Distinct zone ids from zones_df, as a set of strings
our_zones = {str(zone) for zone in zones_df['zone_id'].unique()}
our_zones

{'100',
 '107',
 '113',
 '114',
 '116',
 '12',
 '120',
 '125',
 '127',
 '128',
 '13',
 '137',
 '140',
 '141',
 '142',
 '143',
 '144',
 '148',
 '151',
 '152',
 '153',
 '158',
 '161',
 '162',
 '163',
 '164',
 '166',
 '170',
 '186',
 '194',
 '202',
 '209',
 '211',
 '224',
 '229',
 '230',
 '231',
 '232',
 '233',
 '234',
 '236',
 '237',
 '238',
 '239',
 '24',
 '243',
 '244',
 '246',
 '249',
 '261',
 '262',
 '263',
 '4',
 '41',
 '42',
 '43',
 '45',
 '48',
 '50',
 '68',
 '74',
 '75',
 '79',
 '87',
 '88',
 '90'}

In [83]:
# Zone ids present on the stops but absent from zones_df
zones_to_remove = all_zones.difference(our_zones)
zones_to_remove

set()

In [92]:
# Number of distinct zone ids taken from zones_df.
len(our_zones)

66

In [84]:
# Remove rows in df whose zone id is in the zones_to_remove set.
# zones_to_remove holds stringified ids (all_zones and our_zones are both built
# with str()), so the column is compared as strings as well. Comparing the raw
# values would never match numeric ids, nor stops without a zone (NaN), which
# all_zones records as the string 'nan'.
df = df[~df['zone_id'].astype(str).isin(zones_to_remove)]

In [86]:
# Export the per-stop rows (including zone_id) to CSV; index=False leaves the
# pandas row index out of the file.
df.to_csv('./datasets/bus_stop_shelters_with_zones.csv', index=False)

In [87]:
# Aggregate bus stops by zone: one row per zone_id with the number of stops.
# size() counts the rows of each group directly, so no helper column of ones
# has to be written into the (filtered) frame first, which avoids a
# SettingWithCopyWarning. The result has exactly two columns: zone_id and count.
df = df.groupby('zone_id').size().reset_index(name='count')


In [89]:
# Retain just the zone identifier and its stop count
df = df.loc[:, ['zone_id', 'count']]

In [90]:
# Preview the per-zone stop counts.
df.head()

Unnamed: 0,zone_id,count
0,107,15
1,113,2
2,114,3
3,116,18
4,120,4


In [95]:
# Make sure every zone in our_zones appears in df: zones without any bus stop
# get an explicit row with count 0.
# our_zones contains stringified ids, so the ids already in df are compared as
# strings too; building the set once also avoids rescanning the column for
# every zone.
zones_with_stops = set(df['zone_id'].astype(str))

# Sort the missing ids so the appended rows (and the exported CSV) come out in
# the same order on every run; iterating a set of strings directly does not
# guarantee a stable order between sessions.
missing_zones_count = pd.DataFrame(
    [{'zone_id': zone, 'count': 0} for zone in sorted(our_zones - zones_with_stops)],
    columns=['zone_id', 'count'],
)

# Only concatenate when there is something to add; ignore_index gives the
# combined frame a clean 0..n-1 index instead of repeated labels.
if not missing_zones_count.empty:
    df = pd.concat([df, missing_zones_count], ignore_index=True)

In [96]:
# Export the per-zone stop counts to CSV; index=False leaves the pandas row
# index out of the file.
df.to_csv('./datasets/count_bus_stop_shelters_by_zone.csv', index=False)