In [75]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go


from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, lit

from pyspark.sql.functions import col, \
    from_unixtime, to_date, date_add, when, monotonically_increasing_id, window, mean, count
from pyspark.sql.types import TimestampType


In [76]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("Mean Computer Generator") \
    .getOrCreate()

In [77]:
spark

In [78]:
flights_file_path = os.path.join('dataset', 'raw', '0d02a8')

read_flights = spark.read.parquet(flights_file_path, header=True)

In [79]:
def get_values_from_selected_columns(flight_data):

    #time_diff|      time|icao24|              lat|               lon|          velocity|           heading|          vertrate|callsign|onground|alert|  spi|squawk|      baroaltitude|       geoaltitude|   lastposupdate|     lastcontact|flight_number
    selected_cols = flight_data.select('time', 'icao24', 'lat', 'lon', 'velocity', 'heading', 'vertrate',  \
                                       'baroaltitude', 'geoaltitude', 'lastposupdate', 'lastcontact' , \
                                        'velocity', 'time_diff', 'dropout_length') \
                        .orderBy(asc('time'))
    
    selected_cols = selected_cols.withColumn("datetime", from_unixtime("time"))

    return selected_cols

In [80]:
flight_df = get_values_from_selected_columns(read_flights)

In [81]:
flight_df.show(3)

+----------+------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+----------------+------------------+---------+--------------+-------------------+
|      time|icao24|              lat|               lon|          velocity|           heading|          vertrate|      baroaltitude|       geoaltitude|   lastposupdate|     lastcontact|          velocity|time_diff|dropout_length|           datetime|
+----------+------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+----------------+------------------+---------+--------------+-------------------+
|1645229867|0d02a8|30.18530894134004|-95.45196533203125|145.94713719703776|173.72671649091603|12.679680000000001|3596.6400000000003|2926.0800000000004|1.645229866313E9|1.645229866313E9|145.94713719703776|     null|          null|2022-02-18 18:17:47|


In [82]:
flight_df.count()

8978

#### Preprocessing

##### 1. Duplicates

In [87]:
updated_flight_df = flight_df.groupBy(flight_df.columns)\
    .agg(count('*').alias("count"))\
        .filter(col("count") > 1)

updated_flight_df.show()

+----+------+---+---+--------+-------+--------+------------+-----------+-------------+-----------+--------+---------+--------------+--------+-----+
|time|icao24|lat|lon|velocity|heading|vertrate|baroaltitude|geoaltitude|lastposupdate|lastcontact|velocity|time_diff|dropout_length|datetime|count|
+----+------+---+---+--------+-------+--------+------------+-----------+-------------+-----------+--------+---------+--------------+--------+-----+
+----+------+---+---+--------+-------+--------+------------+-----------+-------------+-----------+--------+---------+--------------+--------+-----+



In [90]:
updated_flight_df = flight_df.dropDuplicates()

In [91]:
updated_flight_df.show()

+----------+------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+----------------+------------------+---------+-------------------+-------------------+
|      time|icao24|               lat|               lon|          velocity|           heading|          vertrate|      baroaltitude|       geoaltitude|   lastposupdate|     lastcontact|          velocity|time_diff|     dropout_length|           datetime|
+----------+------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+----------------+------------------+---------+-------------------+-------------------+
|1645230256|0d02a8|29.673560191009003|-95.53802490234375|183.21502537107276|173.71310663826355|           5.20192| 5928.360000000001| 6111.240000000001|1.645230255779E9|1.645230255942E9|183.21502537107276|      1.0|  1.0030000209808

##### 2. Drop NAs

In [92]:
updated_flight_df = updated_flight_df.na.drop().show()

+----------+------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+----------------+------------------+---------+-------------------+-------------------+
|      time|icao24|               lat|               lon|          velocity|           heading|          vertrate|      baroaltitude|       geoaltitude|   lastposupdate|     lastcontact|          velocity|time_diff|     dropout_length|           datetime|
+----------+------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+----------------+------------------+---------+-------------------+-------------------+
|1645230256|0d02a8|29.673560191009003|-95.53802490234375|183.21502537107276|173.71310663826355|           5.20192| 5928.360000000001| 6111.240000000001|1.645230255779E9|1.645230255942E9|183.21502537107276|      1.0|  1.0030000209808

#### Calculating Mean

In [None]:
# Convert the timestamp column to a TimestampType
mean_calculated_df = flight_df.withColumn("datetime", flight_df["datetime"].cast(TimestampType()))

window_size = "5 minutes"

mean_calculated_df = mean_calculated_df.withColumn("window", window("datetime", window_size)).select('icao24', 'window', 'geoaltitude', 'velocity')

mean_calculated_df.show(5, truncate=False)

In [None]:
agg_df = mean_calculated_df.groupBy("window")\
        .agg(mean(flight_df['geoaltitude']).alias("mean_geoaltitude"),
             mean(flight_df['velocity']).alias("mean_velocity")).orderBy(asc('window'))


agg_df.show(truncate=False)

In [None]:
agg_df.count()