In [38]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go


from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, lit

from pyspark.sql.functions import col, \
    from_unixtime, to_date, date_add, when, monotonically_increasing_id, window, mean, count
from pyspark.sql.types import TimestampType


In [39]:
# Obtain the Spark session used throughout this notebook
# (getOrCreate hands back an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("Time SeriesGenerator")
    .getOrCreate()
)

In [40]:
# Echo the session object as the cell output (quick check that it was created).
spark

In [41]:
# Aircraft identifier to process. It matches the `icao24` column of the data and is
# also used as the raw-data sub-directory name and as the output CSV file name.
#
# Identifiers noted so far:
#   a7124d  (previously selected: FLIGHT_ICAO = 'a7124d')
#   ad564c -> 6 trips
#   adc0fb -> 8 trips

FLIGHT_ICAO = 'ad564c'

In [42]:
# Location of the raw parquet data for the selected aircraft.
flights_file_path = os.path.join('dataset', 'raw', FLIGHT_ICAO)

# flights_file_path = os.path.join('dataset', 'raw', 'batch-2', '*')

# flights_file_path = os.path.join('dataset', 'raw', FLIGHT_ICAO , f'{FLIGHT_ICAO}_0.parquet')

# Parquet files are self-describing (the schema is stored in the file), so no reader
# options are needed. The former `header=True` is a CSV-reader option that the parquet
# source does not use; it was dropped to avoid suggesting otherwise.
read_flights = spark.read.parquet(flights_file_path)

In [43]:
def get_values_from_selected_columns(flight_data):
    """Reduce the raw flight data to the columns used for the altitude series.

    Args:
        flight_data: Spark DataFrame containing at least the columns
            ``time``, ``icao24`` and ``geoaltitude``.

    Returns:
        A DataFrame with ``time``, ``icao24`` and ``geoaltitude``, ordered by
        ascending ``time``, plus a ``datetime`` column obtained by applying
        ``from_unixtime`` to ``time``.
    """
    wanted = ('time', 'icao24', 'geoaltitude')

    return (
        flight_data
        .select(*wanted)
        .orderBy(asc('time'))
        # Human-readable timestamp derived from the epoch value in `time`.
        .withColumn("datetime", from_unixtime("time"))
    )

In [44]:
# Working frame: the selected columns sorted by time, with the added `datetime` column.
flight_df = get_values_from_selected_columns(read_flights)

In [45]:
# Peek at the first three rows to check the columns and the datetime conversion.
flight_df.show(3)

+----------+------+-----------+-------------------+
|      time|icao24|geoaltitude|           datetime|
+----------+------+-----------+-------------------+
|1645486903|ad564c|    1501.14|2022-02-21 17:41:43|
|1645486904|ad564c|    1501.14|2022-02-21 17:41:44|
|1645486905|ad564c|    1501.14|2022-02-21 17:41:45|
+----------+------+-----------+-------------------+
only showing top 3 rows



In [46]:
# Total number of rows in the working frame (before any cleaning).
flight_df.count()

16412

#### Preprocessing

##### 1. Duplicates

In [47]:
# Diagnostic only: list every row that occurs more than once (all columns equal),
# together with how many times it occurs.
#
# The result is kept under its own name instead of `updated_flight_df`, so this
# (usually empty) report can never be picked up as the cleaned data set if the
# de-duplication cell is skipped or the cells are run out of order.
duplicate_rows = (
    flight_df
    .groupBy(flight_df.columns)
    .agg(count('*').alias("count"))
    .filter(col("count") > 1)
)

duplicate_rows.show()

+----+------+-----------+--------+-----+
|time|icao24|geoaltitude|datetime|count|
+----+------+-----------+--------+-----+
+----+------+-----------+--------+-----+



In [48]:
# Keep a single copy of each fully identical row (no column subset is given,
# so all columns are compared).
updated_flight_df = flight_df.dropDuplicates()

In [49]:
# Preview the de-duplicated rows.
updated_flight_df.show()

+----------+------+------------------+-------------------+
|      time|icao24|       geoaltitude|           datetime|
+----------+------+------------------+-------------------+
|1645887498|ad564c|           2461.26|2022-02-26 08:58:18|
|1645887521|ad564c|           2461.26|2022-02-26 08:58:41|
|1645887547|ad564c|           2461.26|2022-02-26 08:59:07|
|1645887582|ad564c|           2461.26|2022-02-26 08:59:42|
|1645887599|ad564c|           2468.88|2022-02-26 08:59:59|
|1645887871|ad564c|           2468.88|2022-02-26 09:04:31|
|1645887988|ad564c|           2468.88|2022-02-26 09:06:28|
|1645888103|ad564c|           2468.88|2022-02-26 09:08:23|
|1645889321|ad564c|           2484.12|2022-02-26 09:28:41|
|1645889637|ad564c|           2484.12|2022-02-26 09:33:57|
|1645889710|ad564c|           2453.64|2022-02-26 09:35:10|
|1645890527|ad564c|2804.1600000000003|2022-02-26 09:48:47|
|1645890538|ad564c|2804.1600000000003|2022-02-26 09:48:58|
|1645890811|ad564c|2773.6800000000003|2022-02-26 09:53:3

##### 2. Drop NAs

In [50]:
# Discard every row that has a null in any column
# (how='any', thresh=None, subset=None are dropna's defaults).
updated_flight_df = updated_flight_df.dropna()

#### Calculating Mean

In [51]:
# Convert the timestamp column to a TimestampType
# mean_calculated_df = flight_df.withColumn("datetime", flight_df["datetime"].cast(TimestampType()))

# window_size = "2 minutes"

# mean_calculated_df = mean_calculated_df.withColumn("window", window("datetime", window_size)).select('icao24', 'window', 'geoaltitude')

# mean_calculated_df.show(5, truncate=False)

In [52]:
# agg_df = mean_calculated_df.groupBy("window")\
#         .agg(mean(flight_df['geoaltitude']).alias("mean_geoaltitude")).orderBy(asc('window'))


# agg_df.show(truncate=False)

In [53]:
# agg_df.count()

In [54]:
# agg_df.toPandas().to_csv(os.path.join('dataset', 'processed', 'geoaltitude', f'{FLIGHT_ICAO}_0.csv'))

# Target folder for the processed series. pandas' to_csv does not create missing
# parent directories, so make sure it exists before writing.
output_dir = os.path.join('dataset', 'processed', 'geoaltitude')
os.makedirs(output_dir, exist_ok=True)

# Sort chronologically before exporting. `datetime` is a 'yyyy-MM-dd HH:mm:ss'
# string, so its lexical order is also its chronological order.
updated_flight_df = updated_flight_df.orderBy(asc('datetime'))

# Collect to the driver and write one CSV per aircraft. The pandas index is still
# written as the first (unnamed) column, exactly as before.
updated_flight_df.toPandas().to_csv(os.path.join(output_dir, f'{FLIGHT_ICAO}.csv'))

In [55]:
# from pyspark.sql.window import Window
# from pyspark.sql.functions import col, expr

# window_size = 3
# horizon = 2

# # Create a window specification for ordering by an arbitrary column (you can use a timestamp column if available)
# window_spec = Window.orderBy(expr("monotonically_increasing_id()"))

# for i in range(window_size + horizon):
#     agg_df = agg_df.withColumn(f"t_plus_{1 * i}", expr(f"lead(mean_geoaltitude, {1 * i})").over(window_spec))

# selected_columns = [f"t_plus_{1 * i}" for i in range(window_size)]

# windowed_df = agg_df.select(*selected_columns).na.drop()

# windowed_df.show()

In [56]:
# windowed_df.toPandas().to_csv(os.path.join('dataset', 'processed', 'geoaltitude', f'batch-2_full_flights.csv'))