## Setup Spark Context and Configuration

In [1]:
import geomesa_pyspark
conf = geomesa_pyspark.configure(
    jars=['/usr/lib/spark/jars/geomesa-hbase-spark-runtime_2.11-2.1.0-m.2.jar'],
    packages=['geomesa_pyspark','pytz'],
    spark_home='/usr/lib/spark/').\
    setAppName('MyTestApp')

conf.get('spark.master')
# u'yarn'

from pyspark.sql import SparkSession

spark = ( SparkSession
    .builder
    .config(conf=conf)
    .getOrCreate()
)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1599127947832_0002,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


## Connect to GeoMesa and retreive a Dataframe backed by a GeoMesa Table

In [7]:
params = {
    "hbase.zookeepers": "hbase.optix-ons-local:2181",
    "hbase.catalog": "ons-historical"
}
feature = "ee"
adsbx = ( spark
    .read
    .format("geomesa")
    .options(**params)
    .option("geomesa.feature", feature)
    .load()
)

adsbx.createOrReplaceTempView("ee")

### Number of planes that visited Dubai over time period

In [8]:
spark.sql("""
select 
    count(distinct Icao) as num_planes
from adsbx 
where st_contains(st_bufferPoint(st_makePoint(55.31, 25.26), 50000), geom) 
    and dtg > cast('2018-10-13' as timestamp) 
    and dtg < cast('2018-10-20' as timestamp)
""").show()

"cannot resolve '`geom`' given input columns: [adsbx.position, adsbx.sog, adsbx.flag_country, adsbx.heading, adsbx.message_type, adsbx.nav_status_code, adsbx.imo, adsbx.width, adsbx.dt_static_utc, adsbx.ts_static_utc, adsbx.length, adsbx.nav_status, adsbx.eeid, adsbx.rot, adsbx.vessel_type_main, adsbx.vessel_type_sub, adsbx.flag_code, adsbx.vessel_class, adsbx.latitude, adsbx.dt_pos_utc, adsbx.callsign, adsbx.ts_pos_utc, adsbx.cog, adsbx.source, adsbx.vessel_name, adsbx.__fid__, adsbx.mmsi, adsbx.destination, adsbx.vessel_type_cargo, adsbx.dtg, adsbx.vessel_type_code, adsbx.eta, adsbx.vessel_type, adsbx.draught, adsbx.longitude]; line 5 pos 69;\n'Project ['count('Icao) AS num_planes#283]\n+- 'Filter (('st_contains(UDF:st_bufferPoint(UDF:st_makePoint(cast(55.31 as double), cast(25.26 as double)), cast(50000 as double)), 'geom) && (dtg#176 > cast(2018-10-13 as timestamp))) && (dtg#176 < cast(2018-10-20 as timestamp)))\n   +- SubqueryAlias adsbx\n      +- Relation[__fid__#142,mmsi#143L,im

### Query for the distinct planes around the Dubai Airport and group by day

In [4]:
spark.sql("""
select 
    num_planes,
    date_format(dtg_sub, "YYYY-MM-dd") as day
from (
    select 
        count(distinct Icao) as num_planes, 
        /* Convert to 00:00:00 */
        date_sub(dtg, 0) as dtg_sub
    FROM (
        select Icao, dtg
        from adsbx 
        where st_contains(st_bufferPoint(st_makePoint(55.31, 25.26), 50000), geom) 
            and dtg > cast('2018-10-13' as timestamp) 
            and dtg < cast('2018-10-20' as timestamp)
        )
    group by
        dtg_sub
    order by 
        dtg_sub
)
""").show()

+----------+----------+
|num_planes|       day|
+----------+----------+
|       663|2018-10-13|
|       506|2018-10-14|
|       576|2018-10-15|
|       646|2018-10-16|
|       662|2018-10-17|
|       655|2018-10-18|
|       663|2018-10-19|
+----------+----------+