In [0]:
# Dependencies to load modules from this repo
import importlib.util
import sys

# Load the project's `cv` module directly from its file path.
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
spec = importlib.util.spec_from_file_location("cv", cv_path)
if spec is None or spec.loader is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ImportError(f"Cannot build an import spec for 'cv' from {cv_path}")
cv = importlib.util.module_from_spec(spec)
# Register the module before executing it (as the importlib recipe for
# importing a source file does) so code that looks the module up by name,
# e.g. pickle or dataclasses, can resolve it.
sys.modules[spec.name] = cv
try:
    spec.loader.exec_module(cv)
except Exception:
    # Do not leave a half-initialised module registered if cv.py fails to run.
    sys.modules.pop(spec.name, None)
    raise

# Dependencies for time series features
from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_timestamp, date_format
import pandas as pd
from prophet import Prophet

# Other Dependencies
import time

# Path for persistent storage: DBFS folder used as the base of ts_data_path,
# i.e. where the time-series parquet snapshot is written and read back.
FOLDER_PATH = "dbfs:/mnt/mids-w261/student-groups/Group_4_2/experiments"

## Load Training Data

In [0]:
# Load from data_loader and save snapshot (run once)
ts_data_path = f"{FOLDER_PATH}/timeseries_data_snapshot.parquet"

print("Loading from data_loader and saving snapshot...")
start = time.time()
data_loader = cv.FlightDelayDataLoader()
data_loader.load()
folds = data_loader.get_version("3M")

# Use final fold (test fold) which has 4 years of data for time series analysis
# This allows us to learn yearly seasonality without data leakage
train_df, test_df = folds[-1]
ts_data = test_df

# Keep the number of output files reasonable: shrink very wide inputs without
# a shuffle (coalesce) and spread very narrow ones out (repartition).
num_partitions = ts_data.rdd.getNumPartitions()
if num_partitions > 500:
    ts_data = ts_data.coalesce(200)
elif num_partitions < 10:
    ts_data = ts_data.repartition(50)

# Save snapshot
ts_data.write.mode("overwrite").parquet(ts_data_path)
print(f"Saved snapshot in {time.time() - start:.2f} seconds")

# Re-read the snapshot that was just written. Without this, ts_data still
# refers to the data_loader lineage, so the summary below and every later
# action would recompute that whole pipeline instead of scanning the parquet
# files. It also makes this cell leave ts_data in the same state as the
# "load from saved snapshot" cell.
ts_data = spark.read.parquet(ts_data_path)

print(f"\nTime series data: {ts_data.count():,} flights")
print(f"Date range: {ts_data.agg(F.min('FL_DATE'), F.max('FL_DATE')).collect()}")

In [0]:
# Load from saved snapshot (run this on subsequent runs, skip above cell)
ts_data_path = f"{FOLDER_PATH}/timeseries_data_snapshot.parquet"

print(f"Loading timeseries data from {ts_data_path}...")
start = time.time()
ts_data = spark.read.parquet(ts_data_path)
# spark.read is lazy, so run one action inside the timed section. The result
# is kept and reused below rather than launching a second count job.
# Note: count() does not cache or persist ts_data.
ts_flight_count = ts_data.count()
print(f"Loaded in {time.time() - start:.2f} seconds")

print(f"\nTime series data: {ts_flight_count:,} flights")
print(f"Date range: {ts_data.agg(F.min('FL_DATE'), F.max('FL_DATE')).collect()}")

## Generate Time-Series

In [0]:
# Prepare the data for daily aggregation:
#   1. parse FL_DATE ("yyyy-MM-dd") into a DATE column named `date`
#   2. keep only rows where both `date` and DEP_DELAY are present
fl_date_as_date = to_timestamp(col("FL_DATE"), "yyyy-MM-dd").cast("date")
has_date_and_delay = col("date").isNotNull() & col("DEP_DELAY").isNotNull()

ts_data_prep = (
    ts_data
    .withColumn("date", fl_date_as_date)
    .filter(has_date_and_delay)
)

### Global Time-Series

In [0]:
# Global time series: average departure delay (plus flight count) per date
daily_aggregates = [
    F.avg("DEP_DELAY").alias("avg_dep_delay"),
    F.count("*").alias("flight_count"),
]
global_dep_delays_spark = (
    ts_data_prep
    .groupBy("date")
    .agg(*daily_aggregates)
    .orderBy("date")
)

# Convert to pandas for Prophet: the averaged delay becomes `y` and a
# datetime copy of `date` is added as `ds`.
global_dep_delays = (
    global_dep_delays_spark
    .toPandas()
    .rename(columns={'avg_dep_delay': 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['date']))
)

print("Global time series (first 10 days):")
preview_columns = ['ds', 'y', 'flight_count']
print(global_dep_delays[preview_columns].head(10))
print(f"\nTotal days: {len(global_dep_delays)}")


### Per Airport Time-Series

In [0]:
# Per-airport time series: for every (origin, date) pair compute the average
# departure delay, the average arrival delay and the number of flights.
airport_day_keys = ["origin", "date"]
per_airport_ts_spark = (
    ts_data_prep
    .groupBy(*airport_day_keys)
    .agg(
        F.avg("DEP_DELAY").alias("avg_dep_delay"),
        F.avg("ARR_DELAY").alias("avg_arr_delay"),
        F.count("*").alias("flight_count"),
    )
    .orderBy(*airport_day_keys)
)

# Convert to pandas and add a datetime `ds` column derived from `date`
per_airport_ts = per_airport_ts_spark.toPandas().assign(
    ds=lambda frame: pd.to_datetime(frame['date'])
)

# Summary statistics
airport_day_rows = len(per_airport_ts)
airport_count = per_airport_ts['origin'].nunique()
print(f"Per-airport time series: {airport_day_rows:,} rows")
print(f"Number of airports: {airport_count}")
print(f"Average days per airport: {airport_day_rows / airport_count:.1f}")

# Preview the first ten rows of whichever airport appears first
print("\nSample (first airport):")
first_airport = per_airport_ts['origin'].iloc[0]
sample_columns = ['origin', 'ds', 'avg_dep_delay', 'avg_arr_delay', 'flight_count']
is_first_airport = per_airport_ts['origin'] == first_airport
print(per_airport_ts.loc[is_first_airport, sample_columns].head(10))


## Time-Series EDA
