In [1]:
from pyspark import SparkConf, SparkFiles
from pyspark.sql import SparkSession
from geopandas import GeoDataFrame
import pandas as pd
from shapely import wkt
from pyspark.sql.functions import broadcast, pandas_udf, PandasUDFType, udf, col
from pyspark.sql.types import StringType
from shapely.geometry import Point, Polygon
import os
import json
import sys

from sedona.utils.adapter import Adapter
from sedona.register import SedonaRegistrator
from sedona.utils import KryoSerializer, SedonaKryoRegistrator

### Create SparkSession

In [2]:
# Alternative setups kept for reference (currently disabled):
#   spark = SparkSession.builder.master('spark://spark:7077').config(conf=conf).appName('OpenSky_app').getOrCreate()
#   os.environ['PYSPARK_PYTHON'] = "./environment/bin/python"
#   config("spark.archives","pyspark_conda_env.tar.gz#environment")

# Python interpreter PySpark should use on the driver side.
os.environ['PYSPARK_DRIVER_PYTHON'] = "/opt/conda/envs/venv36/bin/python"

# Build (or reuse) a session on the standalone master, configured with Kryo
# serialization, Sedona's Kryo registrator, and the Sedona adapter and
# GeoTools wrapper packages.
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating,'
        'org.datasyslab:geotools-wrapper:geotools-24.0',
    )
    .appName('OpenSky_app')
    .getOrCreate()
)

In [3]:
# Register Sedona's spatial SQL functions on this session so the ST_* functions
# used in the spark.sql queries below are available.
SedonaRegistrator.registerAll(spark)

True

### Load the state boundaries dataset

In [4]:
# Read the state boundaries from a local GeoJSON file into a GeoDataFrame.
geo_admin_url = 'admin1-us.geojson'
gdf_states = GeoDataFrame.from_file(geo_admin_url)

# Convert the GeoDataFrame to a Spark DataFrame and expose it to Spark SQL
# as the temporary view `states`.
spark_states_df = spark.createDataFrame(gdf_states)
spark_states_df.createOrReplaceTempView('states')

  aout[:] = out
  aout[:] = out


In [5]:
# Print the column names and types of the states DataFrame.
spark_states_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- ISO3166-1-Alpha-3: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- id: string (nullable = true)
 |-- geometry: geometry (nullable = true)



In [6]:
# Sanity-check the `states` view: show the first five rows ordered by name.
states_by_name = spark.sql(
    """
    SELECT * FROM states ORDER BY name 
    """
)
states_by_name.show(5)

+----------+--------------------+-----------------+----------+--------+--------------------+
|      name|             country|ISO3166-1-Alpha-3|state_code|      id|            geometry|
+----------+--------------------+-----------------+----------+--------+--------------------+
|   Alabama|United States of ...|              USA|        AL|USA-3541|POLYGON ((-85.054...|
|    Alaska|United States of ...|              USA|        AK|USA-3563|MULTIPOLYGON (((-...|
|   Arizona|United States of ...|              USA|        AZ|USA-3520|POLYGON ((-109.04...|
|  Arkansas|United States of ...|              USA|        AR|USA-3528|POLYGON ((-89.662...|
|California|United States of ...|              USA|        CA|USA-3521|POLYGON ((-114.35...|
+----------+--------------------+-----------------+----------+--------+--------------------+
only showing top 5 rows



### Create a DataFrame from locally stored data

In [7]:
# Load the CSV; the first row supplies the column names and Spark infers
# each column's type from the data.
df = spark.read.csv(
    '/opt/bitnami/spark/temp/states_2022-01-03-00.csv',
    header=True,
    inferSchema=True,
)

### Create a UDF to transform lat/lon into a WKT point string

In [8]:
@udf(returnType=StringType())
def create_WKT(lat, lon):
    """Build a WKT point string from a latitude/longitude pair.

    WKT points are written in ``x y`` order, so the longitude comes first.

    Args:
        lat: Latitude value of the point.
        lon: Longitude value of the point.

    Returns:
        A string of the form ``POINT(<lon> <lat>)``, or ``None`` when either
        coordinate is missing.
    """
    # Without this guard a null coordinate would be formatted as the literal
    # text "None", producing a string that is not valid WKT.
    if lat is None or lon is None:
        return None
    return f'POINT({lon} {lat})'

### Discover data schema

In [9]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- time: integer (nullable = true)
 |-- icao24: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- velocity: double (nullable = true)
 |-- heading: double (nullable = true)
 |-- vertrate: double (nullable = true)
 |-- callsign: string (nullable = true)
 |-- onground: boolean (nullable = true)
 |-- alert: boolean (nullable = true)
 |-- spi: boolean (nullable = true)
 |-- squawk: integer (nullable = true)
 |-- baroaltitude: double (nullable = true)
 |-- geoaltitude: double (nullable = true)
 |-- lastposupdate: double (nullable = true)
 |-- lastcontact: double (nullable = true)



# Total number of flying vehicles on a particular Monday

To find this number, we count the unique icao24 identifiers.

In [None]:
# Count the distinct icao24 identifiers present in the data.
icao24_column = df.select('icao24')
unicue_vehicles = icao24_column.distinct().count()
print(f'Number of unique vehicles is {unicue_vehicles}')

### Modify DataFrame and add a column with WKT coordinates

In [10]:
# Only the coordinates are needed to build a point, so drop just the rows that
# lack lat or lon. A bare dropna() would also discard valid positions whose
# other columns (e.g. callsign or squawk) happen to be null.
df.dropna(subset=['lat', 'lon']). \
    withColumn('geometry_wkt', create_WKT(col('lat'), col('lon'))). \
    createOrReplaceTempView('points')

In [11]:
# Parse the WKT text into a Sedona geometry column and expose the result
# to Spark SQL as the temporary view `points_geom`.
points_with_geometry = spark.sql(
    """
    SELECT *, ST_GeomFromWKT(geometry_wkt) as geometry FROM points
    """
)
points_with_geometry.createOrReplaceTempView('points_geom')

In [12]:
# Spatial join: pair each aircraft position with the state whose geometry it
# intersects, and show the first five matches.
# The table alias is declared in lower case to match its uses (`s.name`,
# `s.geometry`), so the query does not depend on case-insensitive resolution.
spark.sql(
    """
    SELECT p.icao24, s.name
    FROM points_geom AS p, states AS s
    WHERE ST_Intersects(p.geometry, s.geometry)
    """
).show(5)

+------+---------+
|icao24|     name|
+------+---------+
|a8df96|    Texas|
|abb1de|    Texas|
|a46796|  Florida|
|a86e6c|Louisiana|
|a70083|    Texas|
+------+---------+
only showing top 5 rows

