In [1]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go


from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, lit

from pyspark.sql.functions import col, \
    from_unixtime, to_date, date_add, when, monotonically_increasing_id, window, mean, count
from pyspark.sql.types import TimestampType


In [2]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Time SeriesGenerator")
    .getOrCreate()
)

23/10/16 12:06:27 WARN Utils: Your hostname, Barathwajas-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.16.130.168 instead (on interface en0)
23/10/16 12:06:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/16 12:06:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Display the active SparkSession object as the cell output.
spark

In [4]:
# Identifier of a single aircraft (presumably its ICAO24 address -- the values
# match the `icao24` column of the raw data).
# Candidates noted so far:
#   a7124d
#   ad564c -> 6 trips
#   adc0fb -> 8 trips
# NOTE(review): in the cells visible here this constant is only referenced by
# commented-out paths; the active cells read the whole 'batch-2' directory.

FLIGHT_ICAO = 'adc0fb'

In [5]:
# Location of the raw Parquet files. The active path globs everything inside
# the 'batch-2' directory. Alternatives that were used for a single aircraft:
#   os.path.join('dataset', 'raw', FLIGHT_ICAO)
#   os.path.join('dataset', 'raw', FLIGHT_ICAO, f'{FLIGHT_ICAO}_0.parquet')
flights_file_path = os.path.join('dataset', 'raw', 'batch-2', '*')

# `header` is a CSV reader option. The Parquet reader does not use it (the
# schema is stored in the files themselves), so it is intentionally not passed.
read_flights = spark.read.parquet(flights_file_path)

                                                                                

In [6]:
def get_values_from_selected_columns(flight_data):
    """Reduce the raw flight data to the columns used for the altitude series.

    Keeps only `time`, `icao24` and `geoaltitude`, sorts the rows by ascending
    `time`, and appends a `datetime` column derived from `time` with
    `from_unixtime` (a formatted timestamp string).

    Args:
        flight_data: Spark DataFrame that contains at least the columns
            `time`, `icao24` and `geoaltitude`.

    Returns:
        A new Spark DataFrame with the columns `time`, `icao24`,
        `geoaltitude` and `datetime`.
    """
    return (
        flight_data
        .select('time', 'icao24', 'geoaltitude')
        .orderBy(asc('time'))
        .withColumn("datetime", from_unixtime("time"))
    )

In [7]:
# Keep time / icao24 / geoaltitude, sort by time and add the `datetime` column.
flight_df = get_values_from_selected_columns(read_flights)

In [8]:
# Preview the first three rows.
flight_df.show(3)

                                                                                

+----------+------+------------------+-------------------+
|      time|icao24|       geoaltitude|           datetime|
+----------+------+------------------+-------------------+
|1645279382|ac3aeb|220.98000000000002|2022-02-19 08:03:02|
|1645279383|ac3aeb|220.98000000000002|2022-02-19 08:03:03|
|1645279384|ac3aeb|228.60000000000002|2022-02-19 08:03:04|
+----------+------+------------------+-------------------+
only showing top 3 rows



In [9]:
# Number of rows before any cleaning is applied.
flight_df.count()

93675

#### Preprocessing

##### 1. Duplicates

In [10]:
# Sanity check: list every row that occurs more than once across all columns.
# The result is kept under its own name so this diagnostic frame is not
# mistaken for the cleaned data, which is derived from `flight_df` separately.
duplicate_rows_df = (
    flight_df
    .groupBy(flight_df.columns)
    .agg(count('*').alias("count"))
    .filter(col("count") > 1)
)

duplicate_rows_df.show()



+----+------+-----------+--------+-----+
|time|icao24|geoaltitude|datetime|count|
+----+------+-----------+--------+-----+
+----+------+-----------+--------+-----+



                                                                                

In [11]:
# Drop rows that are exact duplicates (all columns are compared).
# NOTE(review): the row order set by the earlier orderBy is not guaranteed to
# survive this step; the frame is sorted again before it is exported.
updated_flight_df = flight_df.dropDuplicates()

In [12]:
# Preview the de-duplicated rows.
updated_flight_df.show()

+----------+------+------------------+-------------------+
|      time|icao24|       geoaltitude|           datetime|
+----------+------+------------------+-------------------+
|1645887498|ad564c|           2461.26|2022-02-26 08:58:18|
|1645887521|ad564c|           2461.26|2022-02-26 08:58:41|
|1645887547|ad564c|           2461.26|2022-02-26 08:59:07|
|1645887582|ad564c|           2461.26|2022-02-26 08:59:42|
|1645887599|ad564c|           2468.88|2022-02-26 08:59:59|
|1645887871|ad564c|           2468.88|2022-02-26 09:04:31|
|1645887988|ad564c|           2468.88|2022-02-26 09:06:28|
|1645888103|ad564c|           2468.88|2022-02-26 09:08:23|
|1645889321|ad564c|           2484.12|2022-02-26 09:28:41|
|1645889637|ad564c|           2484.12|2022-02-26 09:33:57|
|1645889710|ad564c|           2453.64|2022-02-26 09:35:10|
|1645890527|ad564c|2804.1600000000003|2022-02-26 09:48:47|
|1645890538|ad564c|2804.1600000000003|2022-02-26 09:48:58|
|1645890811|ad564c|2773.6800000000003|2022-02-26 09:53:3

##### 2. Drop NAs

In [13]:
# Discard every row that has a missing value in any column.
updated_flight_df = updated_flight_df.dropna(how="any")

#### Calculating Mean

In [14]:
# Convert the timestamp column to a TimestampType
# mean_calculated_df = flight_df.withColumn("datetime", flight_df["datetime"].cast(TimestampType()))

# window_size = "2 minutes"

# mean_calculated_df = mean_calculated_df.withColumn("window", window("datetime", window_size)).select('icao24', 'window', 'geoaltitude')

# mean_calculated_df.show(5, truncate=False)

In [15]:
# agg_df = mean_calculated_df.groupBy("window")\
#         .agg(mean(flight_df['geoaltitude']).alias("mean_geoaltitude")).orderBy(asc('window'))


# agg_df.show(truncate=False)

In [16]:
# agg_df.count()

In [17]:
# Export the cleaned rows, sorted by `datetime`, as one CSV file.
# Earlier variants wrote a per-aircraft file named f'{FLIGHT_ICAO}_0.csv'
# (either the windowed means in `agg_df` or this same frame).
output_dir = os.path.join('dataset', 'processed', 'geoaltitude')
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

updated_flight_df = updated_flight_df.orderBy(asc('datetime'))
# toPandas() collects the whole frame onto the driver before writing.
updated_flight_df.toPandas().to_csv(os.path.join(output_dir, 'batch-2.csv'))

In [None]:
# from pyspark.sql.window import Window
# from pyspark.sql.functions import col, expr

# window_size = 3
# horizon = 2

# # Create a window specification for ordering by an arbitrary column (you can use a timestamp column if available)
# window_spec = Window.orderBy(expr("monotonically_increasing_id()"))

# for i in range(window_size + horizon):
#     agg_df = agg_df.withColumn(f"t_plus_{1 * i}", expr(f"lead(mean_geoaltitude, {1 * i})").over(window_spec))

# selected_columns = [f"t_plus_{1 * i}" for i in range(window_size)]

# windowed_df = agg_df.select(*selected_columns).na.drop()

# windowed_df.show()

In [None]:
# windowed_df.toPandas().to_csv(os.path.join('dataset', 'processed', 'geoaltitude', f'batch-2_full_flights.csv'))