In [58]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go


from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, lit

from pyspark.sql.functions import col, \
    from_unixtime, to_date, date_add, when, monotonically_increasing_id, window, mean, count
from pyspark.sql.types import TimestampType


In [59]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("Time SeriesGenerator") \
    .getOrCreate()

In [60]:
spark

In [61]:
FLIGHT_ICAO = 'a7124d' #a7124d

In [62]:
# flights_file_path = os.path.join('dataset', 'raw', FLIGHT_ICAO)

# flights_file_path = os.path.join('dataset', 'raw', 'batch-2', '*')

flights_file_path = os.path.join('dataset', 'raw', FLIGHT_ICAO , f'{FLIGHT_ICAO}_0.parquet')

read_flights = spark.read.parquet(flights_file_path, header=True)

In [63]:
def get_values_from_selected_columns(flight_data):

    selected_cols = flight_data.select('time', 'icao24', 'geoaltitude') \
                        .orderBy(asc('time'))
    
    selected_cols = selected_cols.withColumn("datetime", from_unixtime("time"))

    return selected_cols

In [64]:
flight_df = get_values_from_selected_columns(read_flights)

In [65]:
flight_df.show(3)

+----------+------+------------------+-------------------+
|      time|icao24|       geoaltitude|           datetime|
+----------+------+------------------+-------------------+
|1645384743|a7124d|              null|2022-02-20 13:19:03|
|1645384744|a7124d|1165.8600000000001|2022-02-20 13:19:04|
|1645384745|a7124d|1165.8600000000001|2022-02-20 13:19:05|
+----------+------+------------------+-------------------+
only showing top 3 rows



In [66]:
flight_df.count()

1086

#### Preprocessing

##### 1. Duplicates

In [67]:
updated_flight_df = flight_df.groupBy(flight_df.columns)\
    .agg(count('*').alias("count"))\
        .filter(col("count") > 1)

updated_flight_df.show()

+----+------+-----------+--------+-----+
|time|icao24|geoaltitude|datetime|count|
+----+------+-----------+--------+-----+
+----+------+-----------+--------+-----+



In [68]:
updated_flight_df = flight_df.dropDuplicates()

In [69]:
updated_flight_df.show()

+----------+------+------------------+-------------------+
|      time|icao24|       geoaltitude|           datetime|
+----------+------+------------------+-------------------+
|1645385038|a7124d|           1363.98|2022-02-20 13:23:58|
|1645385247|a7124d|1325.8799999999999|2022-02-20 13:27:27|
|1645385281|a7124d|           1386.84|2022-02-20 13:28:01|
|1645385330|a7124d|1341.1200000000001|2022-02-20 13:28:50|
|1645385679|a7124d|           1341.12|2022-02-20 13:34:39|
|1645385792|a7124d|           1341.12|2022-02-20 13:36:32|
|1645385801|a7124d|           1341.12|2022-02-20 13:36:41|
|1645384778|a7124d|           1234.44|2022-02-20 13:19:38|
|1645384889|a7124d|            1333.5|2022-02-20 13:21:29|
|1645384994|a7124d|           1386.84|2022-02-20 13:23:14|
|1645385067|a7124d|           1363.98|2022-02-20 13:24:27|
|1645384870|a7124d|           1341.12|2022-02-20 13:21:10|
|1645385329|a7124d|           1348.74|2022-02-20 13:28:49|
|1645385158|a7124d|           1363.98|2022-02-20 13:25:5

##### 2. Drop NAs

In [70]:
updated_flight_df = updated_flight_df.dropna(how='any', thresh=None, subset=None)

#### Calculating Mean

In [71]:
# Convert the timestamp column to a TimestampType
mean_calculated_df = flight_df.withColumn("datetime", flight_df["datetime"].cast(TimestampType()))

window_size = "2 minutes"

mean_calculated_df = mean_calculated_df.withColumn("window", window("datetime", window_size)).select('icao24', 'window', 'geoaltitude')

mean_calculated_df.show(5, truncate=False)

+------+------------------------------------------+------------------+
|icao24|window                                    |geoaltitude       |
+------+------------------------------------------+------------------+
|a7124d|{2022-02-20 13:18:00, 2022-02-20 13:20:00}|null              |
|a7124d|{2022-02-20 13:18:00, 2022-02-20 13:20:00}|1165.8600000000001|
|a7124d|{2022-02-20 13:18:00, 2022-02-20 13:20:00}|1165.8600000000001|
|a7124d|{2022-02-20 13:18:00, 2022-02-20 13:20:00}|1165.8600000000001|
|a7124d|{2022-02-20 13:18:00, 2022-02-20 13:20:00}|1173.48           |
+------+------------------------------------------+------------------+
only showing top 5 rows



In [72]:
agg_df = mean_calculated_df.groupBy("window")\
        .agg(mean(flight_df['geoaltitude']).alias("mean_geoaltitude")).orderBy(asc('window'))


agg_df.show(truncate=False)

+------------------------------------------+------------------+
|window                                    |mean_geoaltitude  |
+------------------------------------------+------------------+
|{2022-02-20 13:18:00, 2022-02-20 13:20:00}|1214.1653571428567|
|{2022-02-20 13:20:00, 2022-02-20 13:22:00}|1320.863499999999 |
|{2022-02-20 13:22:00, 2022-02-20 13:24:00}|1361.1860000000008|
|{2022-02-20 13:24:00, 2022-02-20 13:26:00}|1368.425          |
|{2022-02-20 13:26:00, 2022-02-20 13:28:00}|1354.454999999999 |
|{2022-02-20 13:28:00, 2022-02-20 13:30:00}|1356.0424999999996|
|{2022-02-20 13:30:00, 2022-02-20 13:32:00}|1335.4684999999993|
|{2022-02-20 13:32:00, 2022-02-20 13:34:00}|1341.5644999999977|
|{2022-02-20 13:34:00, 2022-02-20 13:36:00}|1341.1199999999978|
|{2022-02-20 13:36:00, 2022-02-20 13:38:00}|1341.1199999999994|
+------------------------------------------+------------------+



In [73]:
agg_df.count()

10

In [74]:
agg_df.toPandas().to_csv(os.path.join('dataset', 'processed', 'geoaltitude', f'testing_flights{FLIGHT_ICAO}_0.csv'))

In [75]:
# from pyspark.sql.window import Window
# from pyspark.sql.functions import col, expr

# window_size = 3
# horizon = 2

# # Create a window specification for ordering by an arbitrary column (you can use a timestamp column if available)
# window_spec = Window.orderBy(expr("monotonically_increasing_id()"))

# for i in range(window_size + horizon):
#     agg_df = agg_df.withColumn(f"t_plus_{1 * i}", expr(f"lead(mean_geoaltitude, {1 * i})").over(window_spec))

# selected_columns = [f"t_plus_{1 * i}" for i in range(window_size)]

# windowed_df = agg_df.select(*selected_columns).na.drop()

# windowed_df.show()

In [76]:
# windowed_df.toPandas().to_csv(os.path.join('dataset', 'processed', 'geoaltitude', f'batch-2_full_flights.csv'))