### Analyzing New York City taxi data using big data tools

#### 1. The NYC taxi data

In [1]:
import arcgis
from arcgis.gis import GIS

# Connect to ArcGIS Online anonymously (no credentials are supplied).
ago_gis = GIS()

# Search for the taxi subset feature layer and keep the first hit.
search_subset = ago_gis.content.search(
    query="NYC_taxi_subset",
    item_type="Feature Layer",
)
subset_item = search_subset[0]
subset_item

In [2]:
# Create a map widget for New York City on which to display the data.
subset_map = ago_gis.map(location="New York, NY", zoomlevel=11)
subset_map

MapView(layout=Layout(height='400px', width='100%'))

In [3]:
# Draw the taxi subset item on the map widget created in the previous cell.
subset_map.add_layer(subset_item)

In [4]:
# Work with the first layer of the feature layer item.
subset_feature_layer = subset_item.layers[0]

# Query attribute information only (no geometry) for a small sample of rows.
# Object IDs start at 1 in the recorded output, so `OBJECTID < 5` yields four rows.
query_result = subset_feature_layer.query(where='OBJECTID < 5',
                                          out_fields="*",
                                          return_geometry=False)

att_data_frame = query_result.sdf # get as a Pandas dataframe
att_data_frame

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,ObjectId,SHAPE
0,2,2015-01-02 19:06:46,2015-01-02 19:11:14,1,0.82,-74.005997,40.735241,1,N,-74.00972,40.72599,1,5.0,1,0.5,1.2,0,0.3,8.0,1,
1,2,2015-01-25 13:39:51,2015-01-26 13:24:58,6,5.62,-73.984688,40.748192,1,N,-74.016289,40.704849,2,19.5,0,0.5,0.0,0,0.3,20.3,2,
2,2,2015-01-13 09:10:38,2015-01-13 09:19:40,1,1.99,-73.967827,40.801315,1,N,-73.959618,40.782703,2,9.0,0,0.5,0.0,0,0.3,9.8,3,
3,1,2015-01-06 10:55:08,2015-01-06 11:05:48,1,1.6,-73.96756,40.801228,1,N,-73.95298,40.819118,2,9.0,0,0.5,0.0,0,0.3,9.8,4,


#### 2. Searching for big data file shares

In [5]:
# Let us connect to an ArcGIS Enterprise.
# Credentials are read from the environment (or prompted for) instead of being
# hard-coded, so they never end up in the saved notebook.
import os
from getpass import getpass

portal_url = os.environ.get("ARCGIS_PORTAL_URL", "https://pythonapi.playground.esri.com/portal")
portal_username = os.environ.get("ARCGIS_USERNAME") or input("Portal username: ")
portal_password = os.environ.get("ARCGIS_PASSWORD") or getpass("Portal password: ")

gis = GIS(portal_url, portal_username, portal_password)

In [6]:
# Check that GeoAnalytics is supported by the Enterprise connection.
# The connection is passed explicitly: this notebook also holds an anonymous
# ArcGIS Online connection, and relying on the implicit active GIS would make
# the answer depend on which connection cell ran last.

arcgis.geoanalytics.is_supported(gis)

True

In [7]:
# Get the GeoAnalytics datastores so they can be searched for the registered datasets.
# The Enterprise connection is passed explicitly rather than relying on the
# implicit active GIS, since this notebook creates more than one connection.

datastores = arcgis.geoanalytics.get_datastores(gis)

In [8]:
# Look up the registered big data file share by its datastore id.
GA_DATA_DATASTORE_ID = '0e7a861d-c1c5-4acc-869d-05d2cebbdbee'
bigdata_fileshares = datastores.search(id=GA_DATA_DATASTORE_ID)
bigdata_fileshares

[<Datastore title:"/bigDataFileShares/GA_Data" type:"bigDataFileShare">]

In [9]:
# GA_Data is registered as a big data file share with the GeoAnalytics datastore,
# so take the first entry of the search result as our reference to it.

data_item = bigdata_fileshares[0]

#### 3. Registering big data file shares

In [10]:
# Display the file share's manifest; the output lists each dataset with its format and schema.
data_item.manifest

{'datasets': [{'name': 'air_quality', 'format': {'quoteChar': '"', 'fieldDelimiter': ',', 'hasHeaderRow': True, 'encoding': 'UTF-8', 'escapeChar': '"', 'recordTerminator': '\n', 'type': 'delimited', 'extension': 'csv'}, 'schema': {'fields': [{'name': 'State Code', 'type': 'esriFieldTypeBigInteger'}, {'name': 'County Code', 'type': 'esriFieldTypeBigInteger'}, {'name': 'Site Num', 'type': 'esriFieldTypeBigInteger'}, {'name': 'Parameter Code', 'type': 'esriFieldTypeBigInteger'}, {'name': 'POC', 'type': 'esriFieldTypeBigInteger'}, {'name': 'Latitude', 'type': 'esriFieldTypeDouble'}, {'name': 'Longitude', 'type': 'esriFieldTypeDouble'}, {'name': 'Datum', 'type': 'esriFieldTypeString'}, {'name': 'Parameter Name', 'type': 'esriFieldTypeString'}, {'name': 'Date Local', 'type': 'esriFieldTypeString'}, {'name': 'Time Local', 'type': 'esriFieldTypeString'}, {'name': 'Date GMT', 'type': 'esriFieldTypeString'}, {'name': 'Time GMT', 'type': 'esriFieldTypeString'}, {'name': 'Sample Measurement', 'typ

In [11]:
# Since this big data file share has multiple datasets, let's check the manifest for the taxi dataset.
# The dataset is selected by name rather than by position, so the cell keeps
# working if the order of datasets in the manifest changes.

taxi_dataset_name = 'analyze_new_york_city_taxi_data'
next(dataset for dataset in data_item.manifest['datasets']
     if dataset['name'] == taxi_dataset_name)

{'name': 'analyze_new_york_city_taxi_data', 'format': {'quoteChar': '"', 'fieldDelimiter': ',', 'hasHeaderRow': True, 'encoding': 'UTF-8', 'escapeChar': '"', 'recordTerminator': '\n', 'type': 'delimited', 'extension': 'csv'}, 'schema': {'fields': [{'name': 'VendorID', 'type': 'esriFieldTypeBigInteger'}, {'name': 'tpep_pickup_datetime', 'type': 'esriFieldTypeString'}, {'name': 'tpep_dropoff_datetime', 'type': 'esriFieldTypeString'}, {'name': 'passenger_count', 'type': 'esriFieldTypeBigInteger'}, {'name': 'trip_distance', 'type': 'esriFieldTypeDouble'}, {'name': 'pickup_longitude', 'type': 'esriFieldTypeDouble'}, {'name': 'pickup_latitude', 'type': 'esriFieldTypeDouble'}, {'name': 'RateCodeID', 'type': 'esriFieldTypeBigInteger'}, {'name': 'store_and_fwd_flag', 'type': 'esriFieldTypeString'}, {'name': 'dropoff_longitude', 'type': 'esriFieldTypeDouble'}, {'name': 'dropoff_latitude', 'type': 'esriFieldTypeDouble'}, {'name': 'payment_type', 'type': 'esriFieldTypeBigInteger'}, {'name': 'fare_

#### 4. Performing data aggregation

In [12]:
# Find the portal items that represent the GA_Data big data file share.
search_result = gis.content.search(
    query="bigDataFileShares_GA_Data",
    item_type="big data file share",
)
search_result

[<Item title:"bigDataFileShares_GA_Data" type:Big Data File Share owner:arcgis_python>, <Item title:"bigDataFileShares_GA_Data" type:Big Data File Share owner:arcgis_python>, <Item title:"bigDataFileShares_GA_Data" type:Big Data File Share owner:api_data_owner>]

In [13]:
# Use the first search hit as the big data file share item.
# NOTE(review): this rebinds `data_item`, which earlier referred to the entry
# taken from `bigdata_fileshares`; from here on it is the item found by the
# content search.
data_item = search_result[0]
data_item

In [14]:
# List the layers exposed by the big data file share item.
data_item.layers

[<Layer url:"https://pythonapi.playground.esri.com/ga/rest/services/DataStoreCatalogs/bigDataFileShares_GA_Data/BigDataCatalogServer/air_quality">, <Layer url:"https://pythonapi.playground.esri.com/ga/rest/services/DataStoreCatalogs/bigDataFileShares_GA_Data/BigDataCatalogServer/crime">, <Layer url:"https://pythonapi.playground.esri.com/ga/rest/services/DataStoreCatalogs/bigDataFileShares_GA_Data/BigDataCatalogServer/calls">, <Layer url:"https://pythonapi.playground.esri.com/ga/rest/services/DataStoreCatalogs/bigDataFileShares_GA_Data/BigDataCatalogServer/analyze_new_york_city_taxi_data">]

In [15]:
# Pick the taxi layer by the dataset name at the end of its URL instead of by
# position, so a change in layer order cannot silently select another dataset.
year_2015 = next(layer for layer in data_item.layers
                 if layer.url.endswith('/analyze_new_york_city_taxi_data'))
year_2015

<Layer url:"https://pythonapi.playground.esri.com/ga/rest/services/DataStoreCatalogs/bigDataFileShares_GA_Data/BigDataCatalogServer/analyze_new_york_city_taxi_data">

#### 5. Aggregate points tool

In [16]:
from arcgis.geoanalytics.summarize_data import aggregate_points

In [17]:
# Check the spatial reference reported in the taxi layer's properties.
year_2015.properties['spatialReference']

{'wkid': 4326}

In [18]:
# Set the processing spatial reference to WKID 3857; the layer itself reports WKID 4326.
# NOTE(review): presumably needed because bin-based aggregation expects a
# projected coordinate system — confirm against the tool documentation.
arcgis.env.process_spatial_reference=3857

In [19]:
# Enable the verbose flag in the arcgis environment settings
# (presumably so tool messages are shown while the analysis job runs — TODO confirm).
arcgis.env.verbose = True

In [38]:
# Aggregate the taxi points into 1 km square bins. The tool aggregates either
# into a polygon layer or into bins of a given size, so one of the two has to
# be specified; here we use bins. The Enterprise connection is passed
# explicitly so the job does not depend on the implicit active GIS.
agg_result = aggregate_points(year_2015,
                              bin_type='Square',
                              bin_size=1,
                              bin_size_unit='Kilometers',
                              gis=gis)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

#### 6. Inspect the results

In [22]:
# Create a map widget for New York City on which to display the aggregation result.
processed_map = gis.map(location='New York, NY', zoomlevel=11)
processed_map

MapView(layout=Layout(height='400px', width='100%'))

In [36]:
# Add the aggregation result to the map.
# NOTE(review): the recorded output of this cell is a RuntimeError ("Cannot infer
# layer"); `agg_result` must hold the value returned by a successful
# aggregate_points run before this cell is executed.
processed_map.add_layer(agg_result)

RuntimeError: Cannot infer layer: will not be added to map

In [37]:
# Share the aggregation result with the organization.
# NOTE(review): the recorded AttributeError shows `agg_result` was bound to a
# function when this cell ran; it must be the value returned by a successful
# aggregate_points run.
agg_result.share(org=True)

AttributeError: 'function' object has no attribute 'share'