# GeoMesa Testing

Testing the geoprocessing capabilities of GeoMesa.

In [1]:
%%init_spark
launcher.master = "spark://spark-master:7077"
launcher.num_executors = 2
launcher.executor_cores = 8
launcher.executor_memory = '16G'
launcher.packages = ["org.apache.hadoop:hadoop-aws:3.2.0",
                    "org.locationtech.geomesa:geomesa-spark-jts_2.12:3.2.0"]
launcher.conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", 
                  "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")

In [2]:
// Load the GeoMesa Spark JTS functions, the JTS geometry classes, and the Spark SQL types used below
import org.apache.spark.sql.types._
import org.locationtech.jts.geom._
import org.locationtech.geomesa.spark.jts._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.types._

Intitializing Scala interpreter ...

Spark Web UI available at http://7847ea2e55be:4040
SparkContext available as 'sc' (version = 3.1.1, master = spark://spark-master:7077, app id = app-20210531023343-0000)
SparkSession available as 'spark'


import org.apache.spark.sql.types._
import org.locationtech.jts.geom._
import org.locationtech.geomesa.spark.jts._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.types._


In [3]:
// Evaluate the session handle to confirm the kernel created a SparkSession.
spark

res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6ab8cb77


In [4]:
// Enable GeoMesa's JTS support (geometry types and spatial functions) on this
// session; the call hands back the same SparkSession instance.
spark.withJTS

res1: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6ab8cb77


## Loading Test Data

In [5]:
// Glob matching the green-taxi trip CSVs for months 01 through 06 of 2015.
val green_trip_data_2015_h1 = "s3a://nyc-tlc/trip data/green_tripdata_2015-0[1-6].csv"
// Only the header option is set, so every column is loaded as a string.
val green_trip_2015_h1 = spark.read
    .option("header", true)
    .csv(green_trip_data_2015_h1)

green_trip_data_2015_h1: String = s3a://nyc-tlc/trip data/green_tripdata_2015-0[1-6].csv
green_trip_2015_h1: org.apache.spark.sql.DataFrame = [VendorID: string, lpep_pickup_datetime: string ... 19 more fields]


In [6]:
// Show the raw column names and types as read from the CSV files.
green_trip_2015_h1.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- Lpep_dropoff_datetime: string (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RateCodeID: string (nullable = true)
 |-- Pickup_longitude: string (nullable = true)
 |-- Pickup_latitude: string (nullable = true)
 |-- Dropoff_longitude: string (nullable = true)
 |-- Dropoff_latitude: string (nullable = true)
 |-- Passenger_count: string (nullable = true)
 |-- Trip_distance: string (nullable = true)
 |-- Fare_amount: string (nullable = true)
 |-- Extra: string (nullable = true)
 |-- MTA_tax: string (nullable = true)
 |-- Tip_amount: string (nullable = true)
 |-- Tolls_amount: string (nullable = true)
 |-- Ehail_fee: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- Total_amount: string (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- Trip_type : string (nullable = true)



In [23]:
// Build a typed view of the raw trips: cast the numeric string columns in
// place (lower-casing their names) and append pickup/dropoff point geometries.
val cleaned_green_trip_2015_h1 = {
    // (new column name, source column, target type) for each in-place cast.
    val castSpecs: Seq[(String, String, DataType)] = Seq(
        ("pickup_longitude", "Pickup_longitude", DoubleType),
        ("pickup_latitude", "Pickup_latitude", DoubleType),
        ("dropoff_longitude", "Dropoff_longitude", DoubleType),
        ("dropoff_latitude", "Dropoff_latitude", DoubleType),
        ("trip_distance", "Trip_distance", DoubleType),
        ("passenger_count", "Passenger_count", IntegerType),
        ("total_amount", "Total_amount", DoubleType)
    )

    val typedTrips = castSpecs.foldLeft(green_trip_2015_h1) {
        case (df, (target, source, dataType)) =>
            df.withColumn(target, col(source).cast(dataType))
    }

    // The points are new columns, so they land at the end of the schema:
    // pickup first, then dropoff.
    typedTrips
        .withColumn("pickup_point", st_makePoint(col("pickup_longitude"), col("pickup_latitude")))
        .withColumn("dropoff_point", st_makePoint(col("dropoff_longitude"), col("dropoff_latitude")))
}


cleaned_green_trip_2015_h1: org.apache.spark.sql.DataFrame = [VendorID: string, lpep_pickup_datetime: string ... 21 more fields]


In [24]:
// Verify the casts and the added point columns in the cleaned schema.
cleaned_green_trip_2015_h1.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- Lpep_dropoff_datetime: string (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RateCodeID: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- Fare_amount: string (nullable = true)
 |-- Extra: string (nullable = true)
 |-- MTA_tax: string (nullable = true)
 |-- Tip_amount: string (nullable = true)
 |-- Tolls_amount: string (nullable = true)
 |-- Ehail_fee: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- Trip_type : string (nullable = true)
 |-- pickup_point: point (nullable = true)
 |-- dropoff_point: point (nullable = true)

## Read in the Taxi Zones

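Note that the taxi zones file must be readable by every executor at the path used below (`/opt/spark-data/nyc_taxi_zones/taxi_zones.csv`).

Download the taxi zones CSV from https://data.cityofnewyork.us/api/views/755u-8jsi/rows.csv?accessType=DOWNLOAD
and place it at that path so that all executors can access it.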

In [8]:
// Load the taxi zone definitions and turn the WKT text in `the_geom` into a
// geometry column that the spatial predicates can work with.
val taxi_zones = {
    val rawZones = spark.read
        .option("header", true)
        .option("inferSchema", "true")
        .csv("/opt/spark-data/nyc_taxi_zones/taxi_zones.csv")

    rawZones.withColumn("the_geom", st_geomFromWKT(col("the_geom")))
}

taxi_zones: org.apache.spark.sql.DataFrame = [OBJECTID: int, Shape_Leng: double ... 5 more fields]


In [25]:
// Confirm the inferred column types and that `the_geom` is now a geometry.
taxi_zones.printSchema()

root
 |-- OBJECTID: integer (nullable = true)
 |-- Shape_Leng: double (nullable = true)
 |-- the_geom: geometry (nullable = true)
 |-- Shape_Area: double (nullable = true)
 |-- zone: string (nullable = true)
 |-- LocationID: integer (nullable = true)
 |-- borough: string (nullable = true)



In [30]:
//taxi_zones.write.format("parquet").mode(SaveMode.Overwrite).saveAsTable("wkt_taxi_zones")

In [26]:
// Tag every trip with the taxi zone whose polygon contains its pickup point.
//
// st_contains is not an equality predicate, so Spark cannot plan a hash or
// sort-merge join for it. The explicit broadcast hint on the zone table steers
// the planner to a broadcast nested-loop join rather than leaving it free to
// fall back to a Cartesian product over the full trip data.
// Assumes the zone table is small enough to fit in executor memory; confirm
// if a different zones file is used.
val merged = org.apache.spark.sql.functions.broadcast(taxi_zones)
    .join(cleaned_green_trip_2015_h1, st_contains(col("the_geom"), col("pickup_point")))

merged: org.apache.spark.sql.DataFrame = [OBJECTID: int, Shape_Leng: double ... 28 more fields]


In [29]:
// Keep only the trip attributes needed downstream and relabel the joined zone
// columns as describing the pickup location.
val cleaned_merged = merged.select(
    col("zone").as("pickup_zone"),
    col("borough").as("pickup_borough"),
    col("pickup_point"),
    col("dropoff_point"),
    col("lpep_pickup_datetime"),
    col("lpep_dropoff_datetime"),
    col("total_amount"),
    col("passenger_count"),
    col("trip_distance")
)

cleaned_merged: org.apache.spark.sql.DataFrame = [pickup_zone: string, pickup_borough: string ... 7 more fields]


In [30]:
// Pull a single row (without the geometry columns) to sanity-check the pickup zone lookup.
cleaned_merged
    .select(
        "lpep_pickup_datetime", "lpep_dropoff_datetime",
        "pickup_zone", "pickup_borough",
        "trip_distance", "passenger_count", "total_amount")
    .take(1)

res14: Array[org.apache.spark.sql.Row] = Array([2015-01-01 00:34:42,2015-01-01 00:38:34,Astoria,Queens,0.88,1,6.3])


In [31]:
// Second spatial join: tag every trip with the taxi zone whose polygon
// contains its dropoff point.
//
// st_contains is a non-equi predicate, so the zone table is explicitly
// broadcast to get a broadcast nested-loop join rather than leaving the
// planner free to choose a Cartesian product over the trip data.
// Assumes the zone table is small enough to fit in executor memory; confirm
// if a different zones file is used.
val drop_off_merged = org.apache.spark.sql.functions.broadcast(taxi_zones)
    .join(cleaned_merged, st_contains(col("the_geom"), col("dropoff_point")))

drop_off_merged: org.apache.spark.sql.DataFrame = [OBJECTID: int, Shape_Leng: double ... 14 more fields]


In [32]:
// Final trip table: the zone columns from the second join become the dropoff
// zone/borough, alongside the pickup labels and trip attributes carried over.
val final_merged = drop_off_merged.select(
    col("zone").as("dropoff_zone"),
    col("borough").as("dropoff_borough"),
    col("pickup_point"),
    col("pickup_zone"),
    col("pickup_borough"),
    col("dropoff_point"),
    col("lpep_pickup_datetime"),
    col("lpep_dropoff_datetime"),
    col("total_amount"),
    col("passenger_count"),
    col("trip_distance")
)

final_merged: org.apache.spark.sql.DataFrame = [dropoff_zone: string, dropoff_borough: string ... 9 more fields]


In [34]:
// Pull two rows (without the geometry columns) to sanity-check both zone lookups.
final_merged
    .select(
        "lpep_pickup_datetime", "lpep_dropoff_datetime",
        "pickup_zone", "pickup_borough",
        "dropoff_zone", "dropoff_borough",
        "trip_distance", "passenger_count", "total_amount")
    .take(2)

res16: Array[org.apache.spark.sql.Row] = Array([2015-01-01 00:34:42,2015-01-01 00:38:34,Astoria,Queens,Astoria,Queens,0.88,1,6.3], [2015-01-01 00:34:46,2015-01-01 00:47:23,Crown Heights North,Brooklyn,Windsor Terrace,Brooklyn,3.08,1,13.3])


In [35]:
// Persist the zone-tagged trips as the Parquet table `taxi_processed`, with
// its files stored under the explicit path below; any previous contents are
// overwritten.
// NOTE(review): the output recorded for this cell is
// "org.apache.spark.SparkException:  Job aborted." and the root cause is not
// captured in the notebook, so check the driver/executor logs before relying
// on this table.
final_merged.write
    .mode(SaveMode.Overwrite)
    .format("parquet")
    .option("path", "/opt/spark-data/processed/taxi_processed")
    .saveAsTable("taxi_processed")

org.apache.spark.SparkException:  Job aborted.

# Stopping Spark

In [None]:
// Shut down the Spark session once the processing above is finished.
spark.stop()

In [None]:
// Scala scratch test: zip two lists together and iterate over the pairs

In [8]:
// Two small parallel lists used to try out zipping.
val cards: List[String] = "a" :: "b" :: "c" :: Nil
val frames = (1 to 3).toList

cards: List[String] = List(a, b, c)
frames: List[Int] = List(1, 2, 3)


In [12]:
// Pair each card with the frame at the same position.
val zip: List[(String, Int)] = cards zip frames

zip: List[(String, Int)] = List((a,1), (b,2), (c,3))


In [13]:
// Print each pair's string and then its number, one value per line.
zip.foreach { case (stringy, inty) =>
    println(stringy)
    println(inty)
}

a
1
b
2
c
3
