In [20]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go


from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, lit

from pyspark.sql.functions import col, \
    from_unixtime, to_date, date_add, when, monotonically_increasing_id, window, mean, count
from pyspark.sql.types import TimestampType


In [21]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("Time SeriesGenerator") \
    .getOrCreate()

In [22]:
spark

In [23]:
# FLIGHT_ICAO = 'a7124d' #a7124d

FLIGHT_ICAO = 'a64ef7'

In [24]:
# flights_file_path = os.path.join('dataset', 'raw', FLIGHT_ICAO)

# flights_file_path = os.path.join('dataset', 'raw', 'batch-2', '*')

flights_file_path = os.path.join('dataset', 'raw', FLIGHT_ICAO , f'{FLIGHT_ICAO}_0.parquet')

read_flights = spark.read.parquet(flights_file_path, header=True)

In [25]:
def get_values_from_selected_columns(flight_data):

    selected_cols = flight_data.select('time', 'icao24', 'geoaltitude') \
                        .orderBy(asc('time'))
    
    selected_cols = selected_cols.withColumn("datetime", from_unixtime("time"))

    return selected_cols

In [26]:
flight_df = get_values_from_selected_columns(read_flights)

In [27]:
flight_df.show(3)

+----------+------+-----------+-------------------+
|      time|icao24|geoaltitude|           datetime|
+----------+------+-----------+-------------------+
|1645461468|a64ef7|    1059.18|2022-02-21 10:37:48|
|1645461469|a64ef7|    1059.18|2022-02-21 10:37:49|
|1645461470|a64ef7|    1059.18|2022-02-21 10:37:50|
+----------+------+-----------+-------------------+
only showing top 3 rows



In [28]:
flight_df.count()

1307

#### Preprocessing

##### 1. Duplicates

In [29]:
updated_flight_df = flight_df.groupBy(flight_df.columns)\
    .agg(count('*').alias("count"))\
        .filter(col("count") > 1)

updated_flight_df.show()

+----+------+-----------+--------+-----+
|time|icao24|geoaltitude|datetime|count|
+----+------+-----------+--------+-----+
+----+------+-----------+--------+-----+



In [30]:
updated_flight_df = flight_df.dropDuplicates()

In [31]:
updated_flight_df.show()

+----------+------+-----------------+-------------------+
|      time|icao24|      geoaltitude|           datetime|
+----------+------+-----------------+-------------------+
|1645461473|a64ef7|          1059.18|2022-02-21 10:37:53|
|1645461925|a64ef7|           1066.8|2022-02-21 10:45:25|
|1645462124|a64ef7|           1028.7|2022-02-21 10:48:44|
|1645462160|a64ef7|           1028.7|2022-02-21 10:49:20|
|1645462445|a64ef7|754.3800000000001|2022-02-21 10:54:05|
|1645462522|a64ef7|693.4200000000001|2022-02-21 10:55:22|
|1645462679|a64ef7|693.4200000000001|2022-02-21 10:57:59|
|1645461485|a64ef7|          1059.18|2022-02-21 10:38:05|
|1645461893|a64ef7|          1051.56|2022-02-21 10:44:53|
|1645462224|a64ef7|           998.22|2022-02-21 10:50:24|
|1645462317|a64ef7|967.7400000000001|2022-02-21 10:51:57|
|1645462452|a64ef7|754.3800000000001|2022-02-21 10:54:12|
|1645462530|a64ef7|693.4200000000001|2022-02-21 10:55:30|
|1645462609|a64ef7|693.4200000000001|2022-02-21 10:56:49|
|1645461691|a6

##### 2. Drop NAs

In [32]:
updated_flight_df = updated_flight_df.dropna(how='any', thresh=None, subset=None)

#### Calculating Mean

In [33]:
# Convert the timestamp column to a TimestampType
mean_calculated_df = flight_df.withColumn("datetime", flight_df["datetime"].cast(TimestampType()))

window_size = "2 minutes"

mean_calculated_df = mean_calculated_df.withColumn("window", window("datetime", window_size)).select('icao24', 'window', 'geoaltitude')

mean_calculated_df.show(5, truncate=False)

+------+------------------------------------------+-----------+
|icao24|window                                    |geoaltitude|
+------+------------------------------------------+-----------+
|a64ef7|{2022-02-21 10:36:00, 2022-02-21 10:38:00}|1059.18    |
|a64ef7|{2022-02-21 10:36:00, 2022-02-21 10:38:00}|1059.18    |
|a64ef7|{2022-02-21 10:36:00, 2022-02-21 10:38:00}|1059.18    |
|a64ef7|{2022-02-21 10:36:00, 2022-02-21 10:38:00}|1059.18    |
|a64ef7|{2022-02-21 10:36:00, 2022-02-21 10:38:00}|1059.18    |
+------+------------------------------------------+-----------+
only showing top 5 rows



In [34]:
agg_df = mean_calculated_df.groupBy("window")\
        .agg(mean(flight_df['geoaltitude']).alias("mean_geoaltitude")).orderBy(asc('window'))


agg_df.show(truncate=False)

+------------------------------------------+------------------+
|window                                    |mean_geoaltitude  |
+------------------------------------------+------------------+
|{2022-02-21 10:36:00, 2022-02-21 10:38:00}|1059.18           |
|{2022-02-21 10:38:00, 2022-02-21 10:40:00}|1043.6225000000004|
|{2022-02-21 10:40:00, 2022-02-21 10:42:00}|1008.3800000000016|
|{2022-02-21 10:42:00, 2022-02-21 10:44:00}|1037.0184999999997|
|{2022-02-21 10:44:00, 2022-02-21 10:46:00}|1056.576499999998 |
|{2022-02-21 10:46:00, 2022-02-21 10:48:00}|1041.7174999999988|
|{2022-02-21 10:48:00, 2022-02-21 10:50:00}|1030.350999999998 |
|{2022-02-21 10:50:00, 2022-02-21 10:52:00}|990.8540000000024 |
|{2022-02-21 10:52:00, 2022-02-21 10:54:00}|851.9160000000006 |
|{2022-02-21 10:54:00, 2022-02-21 10:56:00}|711.199999999999  |
|{2022-02-21 10:56:00, 2022-02-21 10:58:00}|693.4199999999986 |
|{2022-02-21 10:58:00, 2022-02-21 11:00:00}|693.4199999999987 |
+---------------------------------------

In [35]:
agg_df.count()

12

In [36]:
agg_df.toPandas().to_csv(os.path.join('dataset', 'processed', 'geoaltitude', f'{FLIGHT_ICAO}_0.csv'))

In [37]:
# from pyspark.sql.window import Window
# from pyspark.sql.functions import col, expr

# window_size = 3
# horizon = 2

# # Create a window specification for ordering by an arbitrary column (you can use a timestamp column if available)
# window_spec = Window.orderBy(expr("monotonically_increasing_id()"))

# for i in range(window_size + horizon):
#     agg_df = agg_df.withColumn(f"t_plus_{1 * i}", expr(f"lead(mean_geoaltitude, {1 * i})").over(window_spec))

# selected_columns = [f"t_plus_{1 * i}" for i in range(window_size)]

# windowed_df = agg_df.select(*selected_columns).na.drop()

# windowed_df.show()

In [38]:
# windowed_df.toPandas().to_csv(os.path.join('dataset', 'processed', 'geoaltitude', f'batch-2_full_flights.csv'))