# Authors [add authors]
# Date

In [None]:
# Detect the runtime: if `google.colab` cannot be imported the notebook is
# treated as running outside Colab; otherwise Google Drive is mounted at
# /content/drive and IN_COLAB is set to True once the mount call returns.
try:
    from google.colab import drive
except ModuleNotFoundError:
    # No google.colab module available -> not on Colab.
    IN_COLAB = False
else:
    drive.mount('/content/drive')
    IN_COLAB = True

Mounted at /content/drive


In [3]:
!pip install h3

Collecting h3
  Downloading h3-3.7.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: h3
Successfully installed h3-3.7.7


In [None]:
# Choose the data directory for the current environment (the mounted Drive
# folder on Colab, the relative ../data/ folder otherwise) and append the
# file name of each input to it.
TRIPS_PATH = ("/content/drive/MyDrive/Project/" if IN_COLAB else "../data/") + "NYC_Pilot2_PM_Part1.csv"
POLYGON_PATH = ("/content/drive/MyDrive/Project/" if IN_COLAB else "../data/") + "nyc_polygon.geojson"

In [1]:
# Remote copies of the two input files (raw GitHub URLs).
# NOTE(review): these assignments unconditionally replace any TRIPS_PATH /
# POLYGON_PATH values set by an earlier cell (e.g. the Colab / local paths) —
# confirm that always reading from the URLs is intended.
TRIPS_PATH = 'https://raw.githubusercontent.com/IsamAljawarneh/datasets/master/data/NYC_Pilot2_PM_Part1.csv'
POLYGON_PATH = 'https://raw.githubusercontent.com/IsamAljawarneh/datasets/master/data/nyc_polygon.geojson'



In [4]:
import pandas as pd
import folium
import geopandas as gpd

import h3

In [5]:
# Tunable parameters used by the cells below.
sampling_fraction = 0.6  # fraction of rows drawn from each H3 cell when sampling
h3_resolution = 8  # H3 resolution passed to h3.geo_to_h3 when indexing the points

In [6]:
# Load the CSV located at TRIPS_PATH into a DataFrame.
trips = pd.read_csv(TRIPS_PATH)

In [7]:
# Index every record into its H3 cell at `h3_resolution`.
# Iterating over just the two coordinate columns avoids `DataFrame.apply(axis=1)`,
# which builds a full row Series for every record of this wide frame; the
# resulting cell ids are the same.
trips['h3'] = [
    h3.geo_to_h3(lat, lng, resolution=h3_resolution)
    for lat, lng in zip(trips['latitude'], trips['longitude'])
]

In [8]:
# Keep only the timestamp and the H3 cell id of every record.
trips_cut = trips.loc[:, ['time', 'h3']]

In [9]:
# Gather, per H3 cell, the list of timestamps of the records that fall in it.
pickup_g = (
    trips_cut
    .groupby('h3')['time']
    .agg(list)
    .to_frame("ids")
    .reset_index()
)

# Count the points inside each hexagon (length of its timestamp list).
pickup_g['count'] = pickup_g['ids'].apply(len)

In [10]:
# Show the hexagons ordered from the most to the least populated.
pickup_g.sort_values(by='count', ascending=False)

Unnamed: 0,h3,ids,count
10,882a100133fffff,"[1631277304, 1631277308, 1631277313, 163127731...",44746
51,882a100a97fffff,"[1632413438, 1632413443, 1632413448, 163241345...",18925
48,882a100a91fffff,"[1632424336, 1632424341, 1632424346, 163242435...",18174
53,882a100a9bfffff,"[1632408776, 1632408781, 1632408786, 163240879...",15074
52,882a100a99fffff,"[1632408721, 1632408726, 1632408731, 163240873...",11493
...,...,...,...
40,882a1001e9fffff,[1636853386],1
55,882a100aabfffff,[1635531404],1
3,882a10011dfffff,[1636855497],1
44,882a100a1bfffff,[1635530094],1


In [11]:
from shapely.geometry import Polygon
def add_geometry(row):
    """Build the polygon outlining the H3 cell of one dataframe row.

    Args:
        row: mapping-like record (e.g. a DataFrame row) with an ``'h3'`` entry
            holding the cell id.

    Returns:
        ``Polygon`` constructed from the boundary vertices that
        ``h3.h3_to_geo_boundary`` returns for that cell (called with
        ``geo_json=True``).

    NOTE(review): ``Polygon`` is resolved at call time. A later cell re-imports
    the name ``Polygon`` from ``geojson``, so re-running this function after
    that cell would use a different class — confirm execution order.
    """
    boundary = h3.h3_to_geo_boundary(row['h3'], geo_json=True)
    return Polygon(boundary)
# Apply the function to every row of the dataframe.
pickup_g['geometry'] = pickup_g.apply(add_geometry, axis='columns')  # one polygon per hexagon row

In [12]:
!pip install geojson

Collecting geojson
  Downloading geojson-3.1.0-py3-none-any.whl (15 kB)
Installing collected packages: geojson
Successfully installed geojson-3.1.0


In [13]:
from geojson import Feature, Point, FeatureCollection, Polygon

def hexagons_dataframe_to_geojson(df_hex, hex_id_field, geometry_field, value_field, file_output=None):
    """Convert a dataframe of hexagons into a GeoJSON FeatureCollection.

    One ``Feature`` is created per row: its geometry comes from
    ``geometry_field``, its ``id`` from ``hex_id_field`` and its single
    property ``"value"`` from ``value_field``.

    Args:
        df_hex: DataFrame with one row per hexagon.
        hex_id_field: name of the column holding the hexagon id.
        geometry_field: name of the column holding the hexagon geometry.
        value_field: name of the column holding the value to expose.
        file_output: optional path; when given, the collection is written
            there as JSON instead of being returned.

    Returns:
        The ``FeatureCollection`` when ``file_output`` is None; otherwise None
        (the collection is only written to disk).
    """
    # `json` is not imported by any visible notebook cell, so the file-writing
    # branch used to fail with NameError; import it where it is needed.
    import json

    list_features = [
        Feature(
            geometry=row[geometry_field],
            id=row[hex_id_field],
            properties={"value": row[value_field]},
        )
        for _, row in df_hex.iterrows()
    ]
    feat_collection = FeatureCollection(list_features)

    if file_output is None:
        return feat_collection

    with open(file_output, "w") as f:
        json.dump(feat_collection, f)

In [14]:
# Build the FeatureCollection for the hexagons: feature id = H3 cell id,
# feature value = number of points counted in the cell.
geojson_obj = hexagons_dataframe_to_geojson(
    pickup_g,
    hex_id_field='h3',
    geometry_field='geometry',
    value_field='count',
)

In [15]:
import plotly.express as px

# Choropleth of the per-hexagon counts; the colour range runs from 0 to the
# mean count.
fig = px.choropleth_mapbox(
    pickup_g,
    geojson=geojson_obj,
    locations='h3',
    color='count',
    color_continuous_scale="Viridis",
    range_color=(0, pickup_g['count'].mean()),
    mapbox_style='carto-positron',
    zoom=10,
    center=dict(lat=40.8, lon=-73.83),
    opacity=0.7,
    labels={'count': '# of pickups '},
)
# Remove the figure margins, then render it.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

In [16]:
# Convert the trips table to a GeoPandas GeoDataFrame with one point per
# record (x = longitude, y = latitude).
gdf_trips = gpd.GeoDataFrame(
    trips,
    geometry=gpd.points_from_xy(x=trips['longitude'], y=trips['latitude']),
)

In [17]:
# The CRS for trips should remain geographic (EPSG:4326).
# `inplace=True` sets it on gdf_trips itself; because this call is the cell's
# last expression, its return value is what the cell displays.
gdf_trips.set_crs('epsg:4326', inplace=True)

Unnamed: 0,SensorID,time,latitude,longitude,bin0,bin1,bin2,bin3,bin4,bin5,...,bin19,bin20,bin21,bin22,bin23,temperature,humidity,pm25,h3,geometry
0,NYCP2_CS01A,1631277304,40.847672,-73.869316,11,1,1,0,0,0,...,0,0,0,0,0,23.7,57.3,4.508813,882a100133fffff,POINT (-73.86932 40.84767)
1,NYCP2_CS01A,1631277308,40.847668,-73.869316,22,4,1,0,0,2,...,0,0,0,0,0,23.7,57.8,5.462420,882a100133fffff,POINT (-73.86932 40.84767)
2,NYCP2_CS01A,1631277313,40.847649,-73.869362,40,1,1,0,0,1,...,0,0,0,0,0,23.7,57.8,5.154881,882a100133fffff,POINT (-73.86936 40.84765)
3,NYCP2_CS01A,1631277318,40.847649,-73.869362,26,1,0,0,0,0,...,0,0,0,0,0,23.6,57.6,4.508813,882a100133fffff,POINT (-73.86936 40.84765)
4,NYCP2_CS01A,1631277323,40.847649,-73.869362,44,4,0,1,0,0,...,0,0,0,0,0,23.6,57.5,5.539503,882a100133fffff,POINT (-73.86936 40.84765)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169994,NYCP2_CS03A,1631457109,40.823353,-73.890488,115,11,2,0,1,0,...,0,0,0,0,0,24.6,54.8,5.460360,882a100a99fffff,POINT (-73.89049 40.82335)
169995,NYCP2_CS03A,1631457114,40.823349,-73.890480,132,8,2,0,0,0,...,0,0,0,0,0,24.6,54.8,5.298209,882a100a99fffff,POINT (-73.89048 40.82335)
169996,NYCP2_CS03A,1631457119,40.823349,-73.890480,147,14,0,0,0,0,...,0,0,0,0,0,24.6,54.8,6.470661,882a100a99fffff,POINT (-73.89048 40.82335)
169997,NYCP2_CS03A,1631457124,40.823345,-73.890488,121,8,2,0,1,1,...,0,0,0,0,0,24.6,54.6,6.424142,882a100a99fffff,POINT (-73.89049 40.82335)


In [18]:
# BASELINE: original neighbourhoods.
# Read the polygon file referenced by POLYGON_PATH into a GeoDataFrame.
geojson_file = POLYGON_PATH
neighborhoods_original = gpd.read_file(geojson_file)

In [19]:
# BASELINE
# Spatially join every point to the neighbourhood polygon it lies within.
# The join only attaches the neighbourhood attributes (used for metrics); the
# stratified sampling that follows is keyed on the finer-grained H3 cell
# (column 'h3'), not on the neighbourhood.
sjoined_trips_original = gdf_trips.sjoin(neighborhoods_original, predicate="within")
sjoined_trips_original.head(2)

Unnamed: 0,SensorID,time,latitude,longitude,bin0,bin1,bin2,bin3,bin4,bin5,...,temperature,humidity,pm25,h3,geometry,index_right,neighborhood,boroughCode,borough,@id
0,NYCP2_CS01A,1631277304,40.847672,-73.869316,11,1,1,0,0,0,...,23.7,57.3,4.508813,882a100133fffff,POINT (-73.86932 40.84767),38,Bronx Park,2,Bronx,http://nyc.pediacities.com/Resource/Neighborho...
1,NYCP2_CS01A,1631277308,40.847668,-73.869316,22,4,1,0,0,2,...,23.7,57.8,5.46242,882a100133fffff,POINT (-73.86932 40.84767),38,Bronx Park,2,Bronx,http://nyc.pediacities.com/Resource/Neighborho...


In [20]:
# Stratified sampling: draw `sampling_fraction` of the rows from every H3 cell.
# A fixed random_state makes the draw repeatable across "Restart & Run All";
# without it each run produced a different sample (and a different heat map).
sampled_geohash_data = (
    sjoined_trips_original
    .groupby('h3')
    .apply(lambda cell_rows: cell_rows.sample(frac=sampling_fraction, random_state=42))
)

In [21]:
def generateBaseMap(default_location=[40.7306, -73.935], default_zoom_start=11):
    """Create an empty folium map to which layers can be added.

    Args:
        default_location: value forwarded to ``folium.Map`` as ``location``.
            The list default is never modified by this function.
        default_zoom_start: value forwarded to ``folium.Map`` as ``zoom_start``.

    Returns:
        The ``folium.Map`` instance, created with ``control_scale=True``.
    """
    return folium.Map(
        location=default_location,
        zoom_start=default_zoom_start,
        control_scale=True,
    )

In [22]:
# Shuffled copy of the sample (frac=1 keeps every row) with a constant
# weight column `count` = 1 for each row.
sampled_geohash_data_copy = sampled_geohash_data.sample(frac=1).assign(count=1)

In [23]:
# Create the base map using generateBaseMap's default location and zoom.
base_map = generateBaseMap()

In [24]:
from folium.plugins import HeatMap
# Sum the `count` weights of rows sharing the same (latitude, longitude) and
# add the resulting [latitude, longitude, weight] triples to the base map as a
# heat-map layer.
HeatMap(
    data=(
        sampled_geohash_data_copy[['latitude', 'longitude', 'count']]
        .groupby(['latitude', 'longitude'])
        .sum()
        .reset_index()
        .values
        .tolist()
    ),
    radius=8,
    max_zoom=13,
).add_to(base_map)

<folium.plugins.heat_map.HeatMap at 0x7e72f9e96ec0>

In [None]:
# Display the map as the cell output.
base_map