# API Call Stuff

In [2]:
from sodapy import Socrata
import pandas as pd
import geopandas as gpd
import plotly.express as px
import h3
import os
from dotenv import load_dotenv
from folium import Map, Marker, GeoJson
import folium
from geojson.feature import Feature, FeatureCollection
import geojson
import json
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

In [3]:

load_dotenv()

soda_token = os.getenv("SODA_TOKEN")

client = Socrata("data.cityofnewyork.us", app_token=soda_token)

In [6]:
results = client.get("uacg-pexx", limit=500000)

results_df = pd.DataFrame.from_records(results)

In [7]:
results_df = results_df[(results_df['pickup_longitude'] != 0) & (results_df['pickup_latitude'] != 0)]

# Visualizations
KDE over hour, day, and month of pickups across Manhattan

In [10]:
results_df['tpep_pickup_datetime'] = pd.to_datetime(results_df['tpep_pickup_datetime'])
results_df['tpep_dropoff_datetime'] = pd.to_datetime(results_df['tpep_dropoff_datetime'])

results_df['pickup_hour'] = results_df['tpep_pickup_datetime'].dt.hour
results_df['pickup_day'] = results_df['tpep_pickup_datetime'].dt.day_of_week
results_df['pickup_month'] = results_df['tpep_pickup_datetime'].dt.month
results_df['date'] = results_df['tpep_pickup_datetime'].dt.date

In [None]:
fig = px.density_map(
    results_df,
    lat='pickup_latitude',
    lon='pickup_longitude',
    animation_frame='pickup_hour',
    category_orders={'pickup_hour': sorted(results_df['pickup_hour'].unique())},
    radius=2,
    z=None,
    center={'lat': 40.73, 'lon': -74.0},
    zoom=10.5,
    map_style='carto-positron',
    title='Density of Pickup Locations By Hour',
    height=800
)

fig.show()

In [None]:
fig = px.density_map(
    results_df,
    lat='pickup_latitude',
    lon='pickup_longitude',
    animation_frame='pickup_day',
    category_orders={'pickup_day': sorted(results_df['pickup_day'].unique())},
    radius=2,
    z=None,
    center={'lat': 40.73, 'lon': -74.0},
    zoom=10.5,
    map_style='carto-positron',
    title='Density of Pickup Locations By Day',
    height=800
)

fig.show()

In [None]:
fig = px.density_map(
    results_df,
    lat='pickup_latitude',
    lon='pickup_longitude',
    animation_frame='pickup_month',
    category_orders={'pickup_month': sorted(results_df['pickup_month'].unique())},
    radius=2,
    z=None,
    center={'lat': 40.73, 'lon': -74.0},
    zoom=10.5,
    map_style='carto-positron',
    title='Density of Pickup Locations By Month',
    height=800
)

fig.show()

# Hex Binning
The KDEs didn't really allow us to distinguish the peak values the way that I would want them to, so lets use a hexbinning method, which would also allow us to visualize frequencies and also cluster over a discrete set of geographies.

For each of the dataframes, I will be translating each of the longitude latitude pairs into their respective hex cells, and calculating the average number of trips from each of the cell ids and between each of the cell-id pairs by hour, day of week, and month.

In [7]:
hr_average_pu = {}
hr_average_do = {}
hr_average_pu_do = {}

day_average_pu = {}
day_average_do = {}
day_average_pu_do = {}

month_average_pu = {}
month_average_do = {}
month_average_pu_do = {}

In [8]:
results_df['pickup_latitude'] = results_df['pickup_latitude'].astype(float)
results_df['pickup_longitude'] = results_df['pickup_longitude'].astype(float)
results_df['dropoff_latitude'] = results_df['dropoff_latitude'].astype(float)
results_df['dropoff_longitude'] = results_df['dropoff_longitude'].astype(float)

In [9]:
results_df['pu_cell_id'] = results_df.apply(lambda row: h3.latlng_to_cell(
    lat=row['pickup_latitude'],
    lng=row['pickup_longitude'],
    res=10
), axis=1)

results_df['do_cell_id'] = results_df.apply(lambda row: h3.latlng_to_cell(
    lat=row['dropoff_latitude'],
    lng=row['dropoff_longitude'],
    res=10
), axis=1)
results_df['pu_do_ids'] = results_df['pu_cell_id'].str.cat(results_df['do_cell_id'], sep="_")

In [15]:
results_df.head()

Unnamed: 0,vendorid,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,ratecodeid,store_and_fwd_flag,dropoff_longitude,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_hour,pickup_day,pickup_month,pu_cell_id,do_cell_id,pu_do_ids
0,2,2016-02-19 19:26:14,2016-02-19 19:49:33,1,9.36,-73.984192,40.76701,1,N,-73.926651,...,2.5,0,0.3,32.3,19,4,2,8a2a1008b24ffff,8a2a100a1607fff,8a2a1008b24ffff_8a2a100a1607fff
1,1,2016-02-05 18:00:30,2016-02-05 18:05:47,2,0.8,-73.96489,40.769829,1,N,-73.954903,...,1.5,0,0.3,8.8,18,4,2,8a2a100d6917fff,8a2a10089247fff,8a2a100d6917fff_8a2a10089247fff
2,2,2016-02-21 01:16:58,2016-02-21 01:25:32,1,2.12,-73.921471,40.755966,1,N,-73.886528,...,0.0,0,0.3,10.3,1,6,2,8a2a100f360ffff,8a2a100f1297fff,8a2a100f360ffff_8a2a100f1297fff
3,2,2016-02-27 13:50:17,2016-02-27 13:56:02,1,0.5,-73.991737,40.729568,1,N,-73.987427,...,1.26,0,0.3,7.56,13,5,2,8a2a1072c827fff,8a2a100d22affff,8a2a1072c827fff_8a2a100d22affff
4,2,2016-02-05 21:01:45,2016-02-05 21:27:48,1,4.78,-73.951576,40.778469,1,N,-73.998528,...,4.16,0,0.3,24.96,21,4,2,8a2a10089aa7fff,8a2a1072c8b7fff,8a2a10089aa7fff_8a2a1072c8b7fff


In [17]:
pu_cell_id_by_hr = results_df.groupby('pu_cell_id')['pickup_hour'].value_counts()
pu_cell_id_by_day = results_df.groupby('pu_cell_id')['pickup_day'].value_counts()
pu_cell_id_by_month = results_df.groupby('pu_cell_id')['pickup_month'].value_counts()

do_cell_id_by_hr = results_df.groupby('do_cell_id')['pickup_hour'].value_counts()
do_cell_id_by_day = results_df.groupby('do_cell_id')['pickup_day'].value_counts()
do_cell_id_by_month = results_df.groupby('do_cell_id')['pickup_month'].value_counts()

pu_do_id_by_hr = results_df.groupby('pu_do_ids')['pickup_hour'].value_counts()
pu_do_id_by_day = results_df.groupby('pu_do_ids')['pickup_day'].value_counts()
pu_do_id_by_month = results_df.groupby('pu_do_ids')['pickup_month'].value_counts()

In [11]:
def merge_grouped_value_counts(dict_a, dict_b):
    keys = dict_a.keys() | dict_b.keys()
    print(keys)

In [16]:
type(pu_cell_id_by_hr)

pandas.core.indexes.multi.MultiIndex

In [1]:
pu_cell_id_by_hr['8a2a1000062ffff']

NameError: name 'pu_cell_id_by_hr' is not defined

## Averaging Traffic by Hex Bin
In order to establish the "average" number of pickups, dropoffs, and pickup/dropoff pairs at a given hour, we need to be able to divide by the number of unique hours/days/months that appear in the dataset. I don't know for sure that its 

In [56]:
merge_grouped_value_counts(pu_cell_id_by_hr, hr_average_pu)

  keys = dict_a.keys() | dict_b.keys()


ValueError: operands could not be broadcast together with shapes (55456,) (0,) 

## Mapping
After calculating and aggregating trip counts by the hex cells, I will go ahead and visualize the hex bins in choropleth form, normalized by the highest and lowest values present in any of the time buckets of a given type (i.e. day, month, hour)

In [25]:
RESOLUTION = 10

h = h3.latlng_to_cell(lat = 40.73, lng = -74.0, res=RESOLUTION)

h_geom = h3.cells_to_geo(cells = [h])

hex_bin = {"res": RESOLUTION, "geometry": h_geom}

map_test = Map(location = [40.73, -74.0],
                  zoom_start = 10.5,
                  tiles = "cartodbpositron",
                  attr = '''© <a href="http://www.openstreetmap.org/copyright">
                          OpenStreetMap</a>contributors ©
                          <a href="http://cartodb.com/attributions#basemaps">
                          CartoDB</a>'''
                  )


hex_feature = Feature(geometry = hex_bin["geometry"],
                    id = 1,
                    properties = {"resolution": int(hex_bin["res"])})

geojson_result = json.dumps(hex_feature)


GeoJson(
        geojson_result,
        style_function = lambda feature: {
            'fillColor': None,
            'color': "green",
            'weight': 2,
            'fillOpacity': 0.05
        },
        name = "Example"
    ).add_to(map_test)

map_test.save(f'maps/map_test_{RESOLUTION}.html')

In [16]:
nyc_boundary = None
with open("geo/nyc_boroughs.json") as geo:
    nyc_boundary = geojson.load(geo)

In [27]:
boro_cells = {}
for i in range(len(nyc_boundary['features'])):
    boro_id = nyc_boundary['features'][i]['properties']['BoroName']
    boro_cells[boro_id] = h3.geo_to_cells(nyc_boundary['features'][i]['geometry'], res=10)

boro_feature_colls = {}
for id, cells in boro_cells.items():
    feature_list = []
    for cell in cells:
        cell_geom = [[lon, lat] for lat, lon in h3.cell_to_boundary(cell)]
        cell_geom.append(cell_geom[0])
        cell_feature = Feature(id=cell, geometry={'type': 'Polygon', 'coordinates': [cell_geom]})
        feature_list.append(cell_feature)

    feature_col = FeatureCollection(feature_list)
    boro_feature_colls[id] = feature_col

In [19]:
pu_cell_id_by_hr[0]

pu_cell_id
8a754e64992ffff    305
8a2a103b1c97fff    129
8a2a10721b1ffff    103
8a2a1072cd07fff     83
8a2a1072cd57fff     74
                  ... 
8a2a10775b9ffff      1
8a2a1088976ffff      1
8a2a10889a57fff      1
8a2a10c46a07fff      1
8a2a10c5bc27fff      1
Name: count, Length: 2413, dtype: int64

In [31]:
map_hex = Map(location = [40.73, -74.0],
                  zoom_start = 10.5,
                  tiles = "cartodbpositron",
                  attr = '''© <a href="http://www.openstreetmap.org/copyright">
                          OpenStreetMap</a>contributors ©
                          <a href="http://cartodb.com/attributions#basemaps">
                          CartoDB</a>'''
                  )

GeoJson(nyc_boundary,
        style_function = lambda feature: {
            'fillColor': None,
            'color': "yellow",
            'weight': 2
        },
        name = "NYC Boroughs"
        ).add_to(map_hex)

<folium.features.GeoJson at 0x31374a650>

In [32]:
# map_hex = Map(location = [40.73, -74.0],
#                   zoom_start = 10.5,
#                   tiles = "cartodbpositron",
#                   attr = '''© <a href="http://www.openstreetmap.org/copyright">
#                           OpenStreetMap</a>contributors ©
#                           <a href="http://cartodb.com/attributions#basemaps">
#                           CartoDB</a>'''
#                   )

# for index, feature_coll in boro_feature_colls.items():
#     GeoJson(feature_coll,
#         style_function = lambda feature: {
#             'fillColor': None,
#             'color': "green",
#             'weight': 0.5,
#             'fillOpacity': 0,
#         },
#         name = index
#     ).add_to(map_hex)
for index, feature_coll in boro_feature_colls.items():
    folium.Choropleth(
        geo_data=feature_coll,
        data=pu_cell_id_by_hr[0],
        key_on="feature.id",
        fill_opacity=0.8,
        line_weight=1,
    ).add_to(map_hex)

folium.LayerControl().add_to(map_hex)
map_hex.save(f'maps/map_hex{RESOLUTION}.html')

# Other Things to Visualize and Assume
- Road closures are caused by collisions
- Road closures could be inferred by seeing a common (low deviation) trip taking longer than usual for a given time