In [93]:
import os

from geopandas import GeoDataFrame

from pyspark import SparkConf, SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, pandas_udf, PandasUDFType, udf, col, rank
from pyspark.sql.window import Window
from pyspark.sql.types import StringType

from sedona.utils import KryoSerializer, SedonaKryoRegistrator
from sedona.utils.adapter import Adapter
from sedona.register import SedonaRegistrator

### Create SparkSession

In [2]:
os.environ['PYSPARK_DRIVER_PYTHON'] = "/opt/conda/envs/venv36/bin/python"
spark = SparkSession.builder.master("spark://spark:7077"). \
    config("spark.serializer", KryoSerializer.getName). \
    config("spark.kryo.registrator", SedonaKryoRegistrator.getName). \
    config('spark.jars.packages',
           'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating,'
           'org.datasyslab:geotools-wrapper:geotools-24.0'). \
    appName('OpenSky_app'). \
    getOrCreate()

In [3]:
#Add spatial functionality to the SparkSession
SedonaRegistrator.registerAll(spark)

True

### Load states boundaries dataset

In [4]:
#Load data from local file to geoDataFrame
geo_admin_url = 'admin1-us.geojson'
gdf_states = GeoDataFrame.from_file(geo_admin_url)

#Create pySpark dataframe and view
spark_states_df = spark.createDataFrame(gdf_states)
spark_states_df.createOrReplaceTempView('states')

  aout[:] = out
  aout[:] = out


In [5]:
spark_states_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- ISO3166-1-Alpha-3: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- id: string (nullable = true)
 |-- geometry: geometry (nullable = true)



In [6]:
#Inspect view
spark.sql(
    """
    SELECT * FROM states ORDER BY name 
    """
).show(5)

+----------+--------------------+-----------------+----------+--------+--------------------+
|      name|             country|ISO3166-1-Alpha-3|state_code|      id|            geometry|
+----------+--------------------+-----------------+----------+--------+--------------------+
|   Alabama|United States of ...|              USA|        AL|USA-3541|POLYGON ((-85.054...|
|    Alaska|United States of ...|              USA|        AK|USA-3563|MULTIPOLYGON (((-...|
|   Arizona|United States of ...|              USA|        AZ|USA-3520|POLYGON ((-109.04...|
|  Arkansas|United States of ...|              USA|        AR|USA-3528|POLYGON ((-89.662...|
|California|United States of ...|              USA|        CA|USA-3521|POLYGON ((-114.35...|
+----------+--------------------+-----------------+----------+--------+--------------------+
only showing top 5 rows



### Create dataframe from data located localy

In [11]:
df=spark.read.csv('/opt/bitnami/spark/temp/states_2022-01-03-00.csv',inferSchema =True, header = True)

### Create UDF to transform lat/long to shapely geometry

In [12]:
@udf(returnType=StringType())
def create_WKT(lat, lon):
    wkt_point = f'POINT({lon} {lat})'
    return wkt_point

### Create UDF to transforn angle of heading into named direction

In [60]:
@udf(returnType=StringType())
def create_direction(angle):
    if 337.5 < angle or angle < 45: return "North"
    if 67.5 > angle > 22.5 : return "Northeast"
    if 112.5 > angle > 67.5 : return "East"
    if 157.5 > angle > 112.5 : return "Southeast"
    if 202.5 > angle > 157.5 : return "South"
    if 247.5 > angle > 202.5 : return "Southwest"
    if 292.5 > angle > 247.5 : return "West"
    if 337.5 > angle > 292.5 : return "Northwest"
    return None

### Discover data schema

In [13]:
df.printSchema()

root
 |-- time: integer (nullable = true)
 |-- icao24: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- velocity: double (nullable = true)
 |-- heading: double (nullable = true)
 |-- vertrate: double (nullable = true)
 |-- callsign: string (nullable = true)
 |-- onground: boolean (nullable = true)
 |-- alert: boolean (nullable = true)
 |-- spi: boolean (nullable = true)
 |-- squawk: integer (nullable = true)
 |-- baroaltitude: double (nullable = true)
 |-- geoaltitude: double (nullable = true)
 |-- lastposupdate: double (nullable = true)
 |-- lastcontact: double (nullable = true)



# A total number of flying vehicles in particular Monday.

To know that we need to count unique icao24 identifiers

In [10]:
unicue_vehicles = df.select('icao24').distinct().count()
print(f'Number of unique vehicles is {unicue_vehicles}')

Number of unique vehicles is 10508


### Modify DataFrame and add a column with WKT coordinates, shapely geometry and direction

In [61]:
#Filterout null data and vehicles ion the ground
df.dropna(subset=("icao24", "lat", "lon")). \
    filter(col('onground') == False). \
    withColumn('geometry_wkt', create_WKT(col('lat'), col('lon'))). \
    withColumn('heading_str', create_direction(col('heading'))). \
    createOrReplaceTempView('points')

In [62]:
merged_df = spark.sql(
    """
    SELECT *
    FROM (SELECT *, ST_GeomFromWKT(geometry_wkt) as geometry 
            FROM points) AS p, states as S
    WHERE ST_Intersects(p.geometry, s.geometry)
    """
)
merged_df.createOrReplaceTempView('merged_view')

In [63]:
merged_df.printSchema()

root
 |-- time: integer (nullable = true)
 |-- icao24: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- velocity: double (nullable = true)
 |-- heading: double (nullable = true)
 |-- vertrate: double (nullable = true)
 |-- callsign: string (nullable = true)
 |-- onground: boolean (nullable = true)
 |-- alert: boolean (nullable = true)
 |-- spi: boolean (nullable = true)
 |-- squawk: integer (nullable = true)
 |-- baroaltitude: double (nullable = true)
 |-- geoaltitude: double (nullable = true)
 |-- lastposupdate: double (nullable = true)
 |-- lastcontact: double (nullable = true)
 |-- geometry_wkt: string (nullable = true)
 |-- heading_str: string (nullable = true)
 |-- geometry: geometry (nullable = false)
 |-- name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- ISO3166-1-Alpha-3: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- id: string (nullable = true)
 |-- geometry: geometry (nullabl

### Find top values of vertical speed, speed of ascending and descending, and the highest flight in the all US teritory

In [59]:
#Faster ascending vehicle
TOP_values_df = spark.sql(
    """
    SELECT DISTINCT "fastest vertival" as nomination, icao24, name as state, mv.velocity as value
    FROM merged_view as mv
    INNER JOIN    
        (SELECT max(mv) as velocity
        FROM (SELECT icao24, max(velocity) as mv
                FROM merged_view
                GROUP BY icao24) as mv) as mx
                ON mv.velocity = mx.velocity
    UNION
    SELECT DISTINCT "fastest ascending" as nomination, icao24, name as state, mv.vertrate as value
    FROM merged_view as mv
    INNER JOIN    
        (SELECT max(mv) as vertrate
        FROM (SELECT icao24, max(vertrate) as mv
                FROM merged_view
                GROUP BY icao24) as mv) as mx
                ON mv.vertrate = mx.vertrate
    UNION
    SELECT DISTINCT "fastest descending" as nomination, icao24, name as state, mv.vertrate as value
    FROM merged_view as mv
    INNER JOIN    
        (SELECT min(mv) as vertrate
        FROM (SELECT icao24, min(vertrate) as mv
                FROM merged_view
                GROUP BY icao24) as mv) as mx
                ON mv.vertrate = mx.vertrate
    UNION
    SELECT DISTINCT "highest flight" as nomination, icao24, name as state, mv.geoaltitude as value
    FROM merged_view as mv
    INNER JOIN    
        (SELECT max(geoalt) as geoalt
        FROM (SELECT icao24, max(geoaltitude) as geoalt
                FROM merged_view
                GROUP BY icao24) as mv) as mx
                ON mv.geoaltitude = mx.geoalt
    """
)

TOP_values_df.write.csv('TOP_values.csv')

+------------------+------+--------------+------------------+
|        nomination|icao24|         state|             value|
+------------------+------+--------------+------------------+
|fastest descending|a92a7c|    California|-95.91040000000001|
| fastest ascending|a1e9a0|       Indiana|          165.8112|
|  fastest vertival|a43ca8|       Alabama| 554.6338601488377|
| fastest ascending|a585e4|     Louisiana|          165.8112|
|    highest flight|a8793d|South Carolina|38221.920000000006|
+------------------+------+--------------+------------------+



### Find the most common side of the world as direction of flight

In [66]:
#The most common direction
spark.sql(
    """
    SELECT heading_str, count (*) as quantity
    FROM merged_view
    WHERE heading_str IS NOT NULL
    GROUP BY heading_str
    ORDER BY quantity DESC
    """
).show()

+-----------+--------+
|heading_str|quantity|
+-----------+--------+
|       West|  240066|
|      North|  204990|
|       East|  185085|
|  Northwest|  151408|
|  Southwest|  127541|
|      South|  123995|
|  Southeast|  109286|
|  Northeast|   68357|
+-----------+--------+



### Find top values of vertical speed, speed of ascending and descending, and the highest flight in each state

In [68]:
#Fastest vehicle
spark.sql(
    """
    SELECT DISTINCT "fastest vertival" as nomination, name as state, icao24,  mv.velocity as value
    FROM merged_view as mv
    INNER JOIN    
        (SELECT state, max(mv) as velocity
        FROM (SELECT name as state, icao24, max(velocity) as mv
                FROM merged_view
                GROUP BY name, icao24) as mv
        GROUP BY state) as mx
    ON mv.velocity = mx.velocity and mv.name = mx.state
    """
).createOrReplaceTempView('fastest_State')

In [70]:
#Fastest ascending
spark.sql(
    """
    SELECT DISTINCT "fastest ascending" as nomination, name as state, icao24,  mv.vertrate as value
        FROM merged_view as mv
        INNER JOIN    
            (SELECT state, max(mv) as vertrate
            FROM (SELECT name as state, icao24, max(vertrate) as mv
                    FROM merged_view
                    GROUP BY name, icao24) as mv
            GROUP BY state) as mx
        ON mv.vertrate = mx.vertrate and mv.name = mx.state
    """
).createOrReplaceTempView('fastest_ASC')

In [71]:
#Fastest descending
spark.sql(
    """
    SELECT DISTINCT "fastest ascending" as nomination, name as state, icao24,  mv.vertrate as value
        FROM merged_view as mv
        INNER JOIN    
            (SELECT state, min(mv) as vertrate
            FROM (SELECT name as state, icao24, min(vertrate) as mv
                    FROM merged_view
                    GROUP BY name, icao24) as mv
            GROUP BY state) as mx
        ON mv.vertrate = mx.vertrate and mv.name = mx.state
    """
).createOrReplaceTempView('fastest_DESC')

In [73]:
#Highest flight
spark.sql(
    """    
    SELECT DISTINCT "highest flight" as nomination, name as state, icao24, mv.geoaltitude as value
    FROM merged_view as mv
    INNER JOIN    
        (SELECT state, max(geoalt) as geoalt
        FROM (SELECT name as state, icao24, max(geoaltitude) as geoalt
                FROM merged_view
                GROUP BY name, icao24) as mv
        GROUP BY state) as mx
                ON mv.geoaltitude = mx.geoalt and mv.name = mx.state
    """
).createOrReplaceTempView('highest')

In [87]:
state_result_df = spark.sql(
    """
    SELECT 
      fa.state, 
      fa.highest_altitude, 
      fa.highest_plane, 
      fa.fastest_desc, 
      fa.fastest_dc_plain, 
      fa.fastest_asc, 
      fa.fastest_asc_pain, 
      fs.value as highest_velocity, 
      fs.icao24 as fastest_plain 
    FROM 
      (
        SELECT 
          fd.state, 
          fd.highest_altitude, 
          fd.highest_plane, 
          fd.fastest_desc, 
          fd.fastest_dc_plain, 
          fa.value as fastest_asc, 
          fa.icao24 as fastest_asc_pain 
          FROM
              (
            SELECT 
              h.state as state, 
              h.highest_altitude, 
              h.highest_plane, 
              fd.value as fastest_desc, 
              fd.icao24 as fastest_dc_plain 
            FROM 
              (
                SELECT 
                  s.name as state, 
                  h.value as highest_altitude, 
                  h.icao24 as highest_plane 
                FROM 
                  states as s 
                  LEFT JOIN highest as h ON h.state = s.name 
                ORDER BY 
                  s.name
              ) as h 
              LEFT JOIN fastest_DESC as fd ON h.state = fd.state
          ) as fd 
          LEFT JOIN fastest_ASC as fa ON fd.state = fa.state
      ) as fa 
      LEFT JOIN fastest_State as fs ON fa.state = fs.state
    """
)

#Store results as csv
state_result_df.write.csv('TOP_values_by_states.csv')

### Find the most common direction by each state

In [98]:
#The most common direction
windowSpec = Window.partitionBy("state").orderBy("quantity")

spark.sql(
    """
        SELECT name as state, heading_str, count (*) as quantity
            FROM merged_view
            WHERE heading_str IS NOT NULL
            GROUP BY heading_str, name
            ORDER BY quantity DESC
    """
).withColumn("rank",rank().over(windowSpec)). \
    filter(col('rank') == 1). \
    select('state', 'heading_Str'). \
    show()

+--------------------+-----------+
|               state|heading_Str|
+--------------------+-----------+
|                Utah|  Northeast|
|           Minnesota|      South|
|                Ohio|  Southwest|
|            Arkansas|      South|
|              Oregon|  Northeast|
|               Texas|  Northeast|
|        North Dakota|  Northeast|
|        Pennsylvania|  Northeast|
|         Connecticut|  Southeast|
|            Nebraska|      North|
|             Vermont|      South|
|              Nevada|  Northeast|
|          Washington|  Northeast|
|            Illinois|  Northeast|
|            Oklahoma|  Northeast|
|District of Columbia|  Southeast|
|            Delaware|  Northwest|
|              Alaska|  Southeast|
|          New Mexico|      South|
|       West Virginia|  Southeast|
+--------------------+-----------+
only showing top 20 rows

