In [None]:
from pyspark import SparkConf, SparkFiles
from pyspark.sql import SparkSession
from geopandas import GeoDataFrame
import pandas as pd
from shapely import wkt
from pyspark.sql.functions import broadcast, pandas_udf, PandasUDFType, udf, col
from pyspark.sql.types import StringType
from shapely.geometry import Point
import os
import json

### Pack the environment with all dependencies

In [None]:
# Bundle the conda environment into the archive that the SparkSession
# configuration below ships to the cluster through `spark.archives`.
!conda pack -f -o pyspark_conda_env.tar.gz

### Create SparkSession

In [None]:
# Point PySpark at the interpreter inside the unpacked conda archive
# (the path is relative to the directory the archive is extracted into).
os.environ['PYSPARK_PYTHON'] = "./environment/bin/python"

# Connect to the standalone master and ship the packed environment with the
# application; "#environment" names the directory the archive is unpacked to,
# matching the PYSPARK_PYTHON path above.
spark = (
    SparkSession.builder
    .master('spark://spark:7077')
    .config("spark.archives", "pyspark_conda_env.tar.gz#environment")
    .appName('OpenSky_app')
    .getOrCreate()
)

### Load the state boundaries dataset

In [None]:
# Load the state boundaries from a local GeoJSON file into a GeoDataFrame.
geo_admin_url = 'admin1-us.geojson'
gdf_states = GeoDataFrame.from_file(geo_admin_url)

# Serialise every geometry to WKT (plain text) so it can travel through Spark.
# The `wkt` property is used instead of the geometry-level `to_wkt()` method,
# which Shapely has deprecated; the property already returns a `str`.
gdf_states['wkt'] = pd.Series(
    [geom.wkt for geom in gdf_states['geometry']],
    index=gdf_states.index,
    dtype='string',
)

# Build a Spark DataFrame with the columns needed for the lookup, collect it
# as a list of JSON strings (one per state) and broadcast that list.
spark_states_df = spark.createDataFrame(gdf_states[['name', 'state_code', 'wkt']])
spark_states_df_JSON = spark_states_df.toJSON().collect()
states_bc = spark.sparkContext.broadcast(spark_states_df_JSON)

### Register a UDF that adds the state name to the DataFrame

In [None]:
# Lazily filled list of (state name, polygon) pairs parsed from `states_bc`.
_STATE_POLYGONS = []


def _state_polygons():
    """Return the broadcast states as a list of ``(name, polygon)`` pairs.

    The JSON records and their WKT geometries are parsed on first use and
    kept in ``_STATE_POLYGONS`` so later calls reuse the parsed polygons
    instead of rebuilding them for every row.
    """
    if not _STATE_POLYGONS:
        for record in states_bc.value:
            state = json.loads(record)
            _STATE_POLYGONS.append((state.get('name'), wkt.loads(state['wkt'])))
    return _STATE_POLYGONS


@udf(returnType=StringType())
def define_state(wkt_point):
    """Return the name of the state whose polygon contains the given point.

    Args:
        wkt_point: WKT text of a point, or ``None``.

    Returns:
        The ``name`` of the first state polygon that contains the point, or
        ``None`` when the input is ``None`` or no polygon contains it.
    """
    if wkt_point is None:
        return None

    point = wkt.loads(wkt_point)
    for state_name, polygon in _state_polygons():
        if polygon.contains(point):
            # Return the matching state itself, not the first row of the table.
            return state_name
    return None

### Create a DataFrame from locally stored data

In [None]:
# Read the CSV with a header row and let Spark infer the column types.
df = spark.read.csv(
    '/opt/bitnami/spark/temp/states_2022-01-03-00.csv',
    header=True,
    inferSchema=True,
)

### Create UDF to transform lat/long to WKT

In [None]:
@udf(returnType=StringType())
def create_WKT(lat, lon):
    """Return the WKT text of the point at the given latitude and longitude.

    Shapely points (and WKT) take coordinates as (x, y), which for
    geographic data means (longitude, latitude), so the arguments are
    swapped when the point is built.

    Args:
        lat: Latitude of the point, or ``None``.
        lon: Longitude of the point, or ``None``.

    Returns:
        The WKT string of the point, or ``None`` if either coordinate is
        missing.
    """
    if lat is None or lon is None:
        return None
    return Point(lon, lat).wkt
    

### Discover data schema

In [None]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

### Discover first two data rows as a sample

In [None]:
%%time
# Show the first two rows with one field per line (vertical layout).
df.show(2, vertical=True)

# Total number of flying vehicles on a particular Monday

To find this, we count the unique icao24 identifiers.

In [None]:
# Every distinct icao24 value stands for one vehicle seen in the data.
distinct_icao24 = df.select('icao24').distinct()
unicue_vehicles = distinct_icao24.count()
print(f'Number of unique vehicles is {unicue_vehicles}')

### Modify DataFrame and add a column with WKT coordinates

In [None]:
# Drop rows with missing values, then derive the point geometry (as WKT) and
# the state that contains it. The derived column is referenced with the same
# spelling it was created with ('point_wkt'), so the lookup does not depend
# on Spark's case-insensitive column resolution. The transformed frame is
# kept in a variable so it can be reused by later cells.
df_with_state = (
    df.dropna()
    .withColumn('point_wkt', create_WKT(col('lat'), col('lon')))
    .withColumn('US_state', define_state(col('point_wkt')))
)

# NOTE(review): collect() pulls every row to the driver and renders all of
# them as the cell output; consider show()/limit() for a bounded preview.
df_with_state.collect()

In [None]:
# Same enrichment as the full run, but on a ~50% random sample taken before
# the state lookup is applied. The derived column is referenced with the
# spelling it was created with ('point_wkt'), so the lookup does not depend
# on Spark's case-insensitive column resolution.
df_sampled_with_state = (
    df.dropna()
    .withColumn('point_wkt', create_WKT(col('lat'), col('lon')))
    .sample(fraction=0.5)
    .withColumn('US_state', define_state(col('point_wkt')))
)

# NOTE(review): collect() pulls every sampled row to the driver and renders
# all of them as the cell output; consider show()/limit() for a preview.
df_sampled_with_state.collect()

In [None]:
# NOTE(review): none of the visible cells reassign `df`, so this prints the
# schema of the DataFrame as loaded, without the derived columns — confirm
# whether the transformed frame was meant to be inspected here.
df.printSchema()

In [None]:
# Mark `df` to be cached.
# NOTE(review): in the visible cells this comes after every action on `df`;
# confirm whether it should run right after the CSV is loaded instead.
df.cache()