In [0]:
# Dependencies to load modules from this repo
import importlib.util
import sys

# Load the cv module directly from its file path instead of through a package import.
# NOTE(review): this is an absolute workspace path, so the cell only runs where that path exists.
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
# Build an import spec named "cv" for the file, create an empty module object from it,
# then execute the file's code inside that module object.
# NOTE(review): the module is not added to sys.modules here (sys is imported but unused in this cell) — confirm that is intended.
spec = importlib.util.spec_from_file_location("cv", cv_path)
cv = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cv)

# Dependencies for time series features
from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_timestamp, date_format, when
import pandas as pd
import numpy as np
from prophet import Prophet

# Dependencies for EDA
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller, kpss, coint
from statsmodels.tsa.seasonal import STL
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox

# Other Dependencies
import time

# Path for persistent storage
FOLDER_PATH = "dbfs:/mnt/mids-w261/student-groups/Group_4_2/experiments"

## Load Training Data

In [0]:
# Load from data_loader and save snapshot (run once)
# Builds the time-series input from the last cross-validation fold and writes it to
# parquet, so later runs can read the snapshot instead of repeating this cell.
ts_data_path = f"{FOLDER_PATH}/timeseries_data_snapshot.parquet"

print("Loading from data_loader and saving snapshot...")
start = time.time()
data_loader = cv.FlightDelayDataLoader()
data_loader.load()
# NOTE(review): "60M" is presumably the 60-month dataset version — confirm against cv.py.
folds = data_loader.get_version("60M")

# Use final fold and union training and validation to get 2 years of data for time series analysis
# This allows us to learn yearly seasonality without data leakage
# Each fold is unpacked as a (train, validation) pair.
train_df, val_df = folds[-1]

# Union training and validation folds to get 2-year time period
# NOTE(review): union() matches columns by position, so this assumes both frames share the same column order.
ts_data = train_df.union(val_df)

# Check partition count and repartition if needed
# More than 500 partitions are merged down to 200; fewer than 10 are spread out to 50;
# anything in between is left untouched.
num_partitions = ts_data.rdd.getNumPartitions()
if num_partitions > 500:
    ts_data = ts_data.coalesce(200)
elif num_partitions < 10:
    ts_data = ts_data.repartition(50)

# Save snapshot
# mode("overwrite") replaces any snapshot already stored at this path.
ts_data.write.mode("overwrite").parquet(ts_data_path)
print(f"Saved snapshot in {time.time() - start:.2f} seconds")

# NOTE(review): ts_data is not cached in this cell, so the count and the min/max below
# each run as a separate Spark action over the unioned frame.
print(f"\nTime series data: {ts_data.count():,} flights")
print(f"Date range: {ts_data.agg(F.min('FL_DATE'), F.max('FL_DATE')).collect()}")

In [0]:
# Load from saved snapshot (run this on subsequent runs and skip the cell above)
ts_data_path = f"{FOLDER_PATH}/timeseries_data_snapshot.parquet"

print(f"Loading timeseries data from {ts_data_path}...")
start = time.time()
ts_data = spark.read.parquet(ts_data_path)
# Reading parquet is lazy, so count() is what forces a scan and makes the timing
# below meaningful. Keep the result: the summary line reuses it instead of running
# a second full count over the same data.
ts_data_row_count = ts_data.count()
print(f"Loaded in {time.time() - start:.2f} seconds")

print(f"\nTime series data: {ts_data_row_count:,} flights")
print(f"Date range: {ts_data.agg(F.min('FL_DATE'), F.max('FL_DATE')).collect()}")

## Generate Time-Series

In [0]:
# Build the frame used by every aggregation below: add a DateType "date" column
# derived from FL_DATE, then keep only rows where both that date and the
# departure delay are present.
fl_date_as_date = to_timestamp(col("FL_DATE"), "yyyy-MM-dd").cast("date")

ts_data_prep = (
    ts_data
    .withColumn("date", fl_date_as_date)
    .where(col("date").isNotNull())
    .where(col("DEP_DELAY").isNotNull())
)

### Global Time-Series

In [0]:
# Global time series: one row per calendar date holding the mean departure delay
# and the number of flights on that date, ordered by date.
daily_delay_aggs = [
    F.avg("DEP_DELAY").alias("avg_dep_delay"),
    F.count("*").alias("flight_count"),
]
global_dep_delays_spark = (
    ts_data_prep
    .groupBy("date")
    .agg(*daily_delay_aggs)
    .orderBy("date")
)

# Bring the (small) daily aggregate to pandas in the layout Prophet expects:
# a datetime 'ds' column and the target renamed to 'y'.
global_dep_delays = (
    global_dep_delays_spark
    .toPandas()
    .assign(ds=lambda daily: pd.to_datetime(daily['date']))
    .rename(columns={'avg_dep_delay': 'y'})
)

print("Global time series (first 10 days):")
print(global_dep_delays[['ds', 'y', 'flight_count']].head(10))
print(f"\nTotal days: {len(global_dep_delays)}")


### Per Airport Time-Series

In [0]:
# Per-airport daily series: mean departure delay, mean arrival delay and flight
# count for every (origin, date) pair, ordered by origin then date.
airport_day_keys = ["origin", "date"]
per_airport_ts_spark = (
    ts_data_prep
    .groupBy(*airport_day_keys)
    .agg(
        F.avg("DEP_DELAY").alias("avg_dep_delay"),
        F.avg("ARR_DELAY").alias("avg_arr_delay"),
        F.count("*").alias("flight_count"),
    )
    .orderBy(*airport_day_keys)
)

# Collect to pandas and add the datetime 'ds' column used by the later cells.
per_airport_ts = per_airport_ts_spark.toPandas()
per_airport_ts = per_airport_ts.assign(ds=pd.to_datetime(per_airport_ts['date']))

airport_row_total = len(per_airport_ts)
airport_total = per_airport_ts['origin'].nunique()
print(f"Per-airport time series: {airport_row_total:,} rows")
print(f"Number of airports: {airport_total}")
print(f"Average days per airport: {airport_row_total / airport_total:.1f}")

# Show the first ten days of whichever airport appears first in the frame.
print("\nSample (first airport):")
first_airport = per_airport_ts['origin'].iloc[0]
sample_columns = ['origin', 'ds', 'avg_dep_delay', 'avg_arr_delay', 'flight_count']
print(per_airport_ts.loc[per_airport_ts['origin'] == first_airport, sample_columns].head(10))


## Time-Series EDA


In [0]:
import matplotlib.pyplot as plt
### 1. Raw Time Series Plots

# Two stacked panels of the global daily average departure delay: the full
# series on top and the last 180 daily rows (shown as "Last 6 Months") below.
last_6mo = global_dep_delays.tail(180)
raw_series_panels = [
    (
        global_dep_delays,
        'Global Average Departure Delay Over Time (Full Series)',
        dict(linewidth=0.5, alpha=0.7),
    ),
    (
        last_6mo,
        'Global Average Departure Delay (Last 6 Months)',
        dict(linewidth=1, marker='o', markersize=2),
    ),
]

fig, axes = plt.subplots(2, 1, figsize=(14, 10))
for panel_ax, (panel_frame, panel_title, line_style) in zip(axes, raw_series_panels):
    panel_ax.plot(panel_frame['ds'], panel_frame['y'], **line_style)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Date')
    panel_ax.set_ylabel('Average Departure Delay (minutes)')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Descriptive statistics of the daily averages
print("\n=== Global Departure Delay Summary Statistics ===")
print(global_dep_delays['y'].describe())


In [0]:
### 2. Distribution and Box Plots

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left panel: histogram of the daily averages with the overall mean marked.
overall_mean_delay = global_dep_delays['y'].mean()
hist_ax = axes[0, 0]
hist_ax.hist(global_dep_delays['y'], bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_title('Distribution of Daily Average Departure Delays', fontweight='bold')
hist_ax.set_xlabel('Average Departure Delay (minutes)')
hist_ax.set_ylabel('Frequency')
hist_ax.axvline(overall_mean_delay, color='r', linestyle='--', label=f'Mean: {overall_mean_delay:.2f}')
hist_ax.legend()

# Calendar keys for the box plots. They are written onto global_dep_delays
# itself, so the frame keeps 'year', 'month' and 'day_of_week' columns after
# this cell has run.
ds_accessor = global_dep_delays['ds'].dt
global_dep_delays['year'] = ds_accessor.year
global_dep_delays['month'] = ds_accessor.month
global_dep_delays['day_of_week'] = ds_accessor.dayofweek

# Remaining three panels: one box plot per calendar key.
box_panels = [
    (axes[0, 1], 'year', 'Departure Delays by Year', 'Year'),
    (axes[1, 0], 'month', 'Departure Delays by Month', 'Month'),
    (axes[1, 1], 'day_of_week', 'Departure Delays by Day of Week', 'Day of Week (0=Monday)'),
]
for box_ax, group_col, box_title, box_xlabel in box_panels:
    sns.boxplot(data=global_dep_delays, x=group_col, y='y', ax=box_ax)
    box_ax.set_title(box_title, fontweight='bold')
    box_ax.set_xlabel(box_xlabel)
    box_ax.set_ylabel('Average Departure Delay (minutes)')

plt.tight_layout()
plt.show()

In [0]:
### 3. Stationarity Tests

def _print_critical_values(critical_values):
    """Print one indented 'level: threshold' line per entry of a critical-value mapping."""
    for level, threshold in critical_values.items():
        print(f"      {level}: {threshold:.4f}")


# Both tests run on the daily average delay with missing values removed.
ts_values = global_dep_delays['y'].dropna().values

print("=== Stationarity Tests ===\n")

# --- Augmented Dickey-Fuller: the null hypothesis is a unit root ---
print("1. Augmented Dickey-Fuller Test (ADF):")
print("   H0: Series has a unit root (non-stationary)")
print("   H1: Series is stationary\n")
adf_result = adfuller(ts_values)
adf_statistic, adf_pvalue, adf_critical = adf_result[0], adf_result[1], adf_result[4]
print(f"   ADF Statistic: {adf_statistic:.4f}")
print(f"   p-value: {adf_pvalue:.4f}")
print("   Critical Values:")
_print_critical_values(adf_critical)
adf_verdict = (
    "   ✓ Series is STATIONARY (reject H0, p < 0.05)"
    if adf_pvalue <= 0.05
    else "   ✗ Series is NON-STATIONARY (fail to reject H0, p >= 0.05)"
)
print(adf_verdict)

print("\n" + "="*60 + "\n")

# --- KPSS: the null hypothesis is stationarity ---
# regression='ct' includes a constant and a trend in the test regression.
print("2. KPSS Test:")
print("   H0: Series is stationary")
print("   H1: Series has a unit root (non-stationary)\n")
kpss_result = kpss(ts_values, regression='ct')
kpss_statistic, kpss_pvalue, kpss_critical = kpss_result[0], kpss_result[1], kpss_result[3]
print(f"   KPSS Statistic: {kpss_statistic:.4f}")
print(f"   p-value: {kpss_pvalue:.4f}")
print("   Critical Values:")
_print_critical_values(kpss_critical)
kpss_verdict = (
    "   ✓ Series is STATIONARY (fail to reject H0, p >= 0.05)"
    if kpss_pvalue >= 0.05
    else "   ✗ Series is NON-STATIONARY (reject H0, p < 0.05)"
)
print(kpss_verdict)

In [0]:
### 4. Autocorrelation Analysis

# Daily average delay without missing values; shared by both correlograms and
# by the Ljung-Box test.
delay_without_nan = global_dep_delays['y'].dropna()

# (plotting function, panel title, y-axis label) for the two stacked panels
correlogram_panels = [
    (plot_acf, 'Autocorrelation Function (ACF)', 'Autocorrelation'),
    (plot_pacf, 'Partial Autocorrelation Function (PACF)', 'Partial Autocorrelation'),
]

fig, axes = plt.subplots(2, 1, figsize=(14, 8))
for corr_ax, (draw_correlogram, corr_title, corr_ylabel) in zip(axes, correlogram_panels):
    # 50 lags with 95% confidence bands (alpha=0.05)
    draw_correlogram(delay_without_nan, lags=50, ax=corr_ax, alpha=0.05)
    corr_ax.set_title(corr_title, fontsize=14, fontweight='bold')
    corr_ax.set_xlabel('Lag')
    corr_ax.set_ylabel(corr_ylabel)

plt.tight_layout()
plt.show()

# Ljung-Box test over lags 1..10; any p-value below 0.05 is reported as
# significant autocorrelation.
print("\n=== Ljung-Box Test for Autocorrelation ===")
print("H0: No autocorrelation")
print("H1: Autocorrelation exists\n")
lb_result = acorr_ljungbox(delay_without_nan, lags=10, return_df=True)
print(lb_result)
has_significant_lag = (lb_result['lb_pvalue'] < 0.05).any()
print(
    "\n✗ Significant autocorrelation detected (p < 0.05)"
    if has_significant_lag
    else "\n✓ No significant autocorrelation (p >= 0.05)"
)

In [0]:
### 5. STL Decomposition (Seasonal and Trend decomposition using Loess)

# Set date as index for STL
ts_indexed = global_dep_delays.set_index('ds')['y'].dropna()

# STL takes the cycle length through `period`. Its `seasonal` argument is the
# width of the seasonal LOESS smoother (an odd integer), not the cycle length,
# so passing 365 there does not request yearly seasonality. The index produced
# by set_index carries no frequency, so the period must be given explicitly.
# A yearly cycle is only estimated when at least two full cycles of daily rows
# are available; otherwise the decomposition falls back to the weekly cycle.
STL_YEARLY_PERIOD = 365
STL_WEEKLY_PERIOD = 7
stl_period = STL_YEARLY_PERIOD if len(ts_indexed) >= 2 * STL_YEARLY_PERIOD else STL_WEEKLY_PERIOD
print(f"STL period: {stl_period} days ({len(ts_indexed)} daily observations)")

# robust=True down-weights outlying days when fitting the components.
stl = STL(ts_indexed, period=stl_period, robust=True)
decomposition = stl.fit()

fig, axes = plt.subplots(4, 1, figsize=(14, 12))

# (series, panel title, y-axis label, line style) for the four stacked panels
stl_panels = [
    (ts_indexed, 'Original Time Series', 'Departure Delay', dict(linewidth=0.5, alpha=0.7)),
    (decomposition.trend, 'Trend Component', 'Trend', dict(linewidth=1, color='blue')),
    (decomposition.seasonal, 'Seasonal Component', 'Seasonal', dict(linewidth=0.5, alpha=0.7, color='green')),
    (decomposition.resid, 'Residual Component', 'Residual', dict(linewidth=0.5, alpha=0.7, color='red')),
]
for stl_ax, (component, stl_title, stl_ylabel, line_style) in zip(axes, stl_panels):
    stl_ax.plot(component.index, component.values, **line_style)
    stl_ax.set_title(stl_title, fontweight='bold')
    stl_ax.set_ylabel(stl_ylabel)
    stl_ax.grid(True, alpha=0.3)
# Only the bottom panel carries the shared x-axis label.
axes[-1].set_xlabel('Date')

plt.tight_layout()
plt.show()

# Summary of decomposition
print("\n=== STL Decomposition Summary ===")
print(f"Trend variance: {decomposition.trend.var():.4f}")
print(f"Seasonal variance: {decomposition.seasonal.var():.4f}")
print(f"Residual variance: {decomposition.resid.var():.4f}")
print(f"\nResidual statistics:")
print(decomposition.resid.describe())

In [0]:
### 6. Additional Statistical Analysis

# Derive the calendar keys from 'ds' in this cell instead of reading the
# 'year' / 'month' / 'day_of_week' columns, which only exist on
# global_dep_delays after the box-plot cell has been run. The keys are named so
# the grouped results keep the same index names as before.
ds_parts = global_dep_delays['ds'].dt
month_key = ds_parts.month.rename('month')
day_of_week_key = ds_parts.dayofweek.rename('day_of_week')  # pandas convention: 0=Monday
year_key = ds_parts.year.rename('year')

# Monthly averages
monthly_avg = global_dep_delays.groupby(month_key)['y'].mean()
print("=== Monthly Average Departure Delays ===")
for month, avg in monthly_avg.items():
    # Any year works here; the timestamp is only used to turn the month number into its name.
    month_name = pd.Timestamp(2020, month, 1).strftime('%B')
    print(f"{month_name:12s}: {avg:6.2f} minutes")

# Day of week averages
dow_avg = global_dep_delays.groupby(day_of_week_key)['y'].mean()
dow_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
print("\n=== Day of Week Average Departure Delays ===")
for dow, avg in dow_avg.items():
    print(f"{dow_names[dow]:12s}: {avg:6.2f} minutes")

# Year-over-year comparison
print("\n=== Year-over-Year Comparison ===")
yearly_avg = global_dep_delays.groupby(year_key)['y'].agg(['mean', 'std', 'min', 'max'])
print(yearly_avg)

**TODO**: Additional Time Series Analysis

- [ ] **Delays by Airplane Model**: Time series of average departure delays by aircraft model (e.g., Boeing 737, Airbus A320)
- [ ] **Delays by Carrier**: Time series of average departure delays by airline carrier
- [ ] Apply same EDA (stationarity tests, STL decomposition, etc.) to these additional time series

In [0]:
### 7. Cointegration Test: Flight Count vs Average Delay

# Test if number of flights and average delay are cointegrated
# Cointegration means they have a long-term equilibrium relationship
# even if individually non-stationary

# Align the series on the same dates: a day is dropped when EITHER value is
# missing, so position i in both arrays and in the date axis used for plotting
# always refers to the same day. Dropping NaNs from each column separately and
# then truncating to the shorter length would shift one series against the other.
coint_frame = global_dep_delays[['ds', 'flight_count', 'y']].dropna()
flight_count_series = coint_frame['flight_count'].values
delay_series = coint_frame['y'].values
coint_dates = coint_frame['ds']
min_len = len(coint_frame)  # number of aligned days

print("=== Cointegration Test: Flight Count vs Average Delay ===\n")
print("H0: No cointegration (series are not in long-term equilibrium)")
print("H1: Cointegration exists (series have long-term relationship)\n")

# Engle-Granger cointegration test
# coint returns (test statistic, p-value, critical values).
coint_result = coint(delay_series, flight_count_series)

print(f"Cointegration Test Statistic: {coint_result[0]:.4f}")
print(f"p-value: {coint_result[1]:.4f}")
print(f"Critical Values:")
# NOTE(review): this loop expects a mapping of level -> value; confirm the type
# returned by the installed statsmodels version.
for key, value in coint_result[2].items():
    print(f"   {key}: {value:.4f}")

if coint_result[1] <= 0.05:
    print("\n✓ COINTEGRATION DETECTED (p < 0.05)")
    print("  → Flight count and delay have a long-term equilibrium relationship")
    print("  → They move together over time despite short-term deviations")
else:
    print("\n✗ No cointegration (p >= 0.05)")
    print("  → Flight count and delay do not have a stable long-term relationship")

# Visualize the relationship
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# Top panel: both series over time on twin y-axes (delay on the left, count on the right)
ax1 = axes[0]
ax1_twin = ax1.twinx()
ax1.plot(coint_dates, delay_series, 'b-', label='Avg Delay', linewidth=1, alpha=0.7)
ax1_twin.plot(coint_dates, flight_count_series, 'r-', label='Flight Count', linewidth=1, alpha=0.7)
ax1.set_xlabel('Date')
ax1.set_ylabel('Average Delay (minutes)', color='b')
ax1_twin.set_ylabel('Flight Count', color='r')
ax1.set_title('Flight Count vs Average Delay Over Time', fontweight='bold')
ax1.tick_params(axis='y', labelcolor='b')
ax1_twin.tick_params(axis='y', labelcolor='r')
ax1.grid(True, alpha=0.3)

# Bottom panel: scatter of the aligned daily pairs
axes[1].scatter(flight_count_series, delay_series, alpha=0.3, s=10)
axes[1].set_xlabel('Flight Count')
axes[1].set_ylabel('Average Departure Delay (minutes)')
axes[1].set_title('Flight Count vs Average Delay (Scatter)', fontweight='bold')
axes[1].grid(True, alpha=0.3)

# Annotate the scatter with the Pearson correlation of the aligned pairs
correlation = np.corrcoef(flight_count_series, delay_series)[0, 1]
axes[1].text(0.05, 0.95, f'Correlation: {correlation:.3f}', 
             transform=axes[1].transAxes, fontsize=12,
             verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

## Prophet Feature Generation

Use Prophet to extract time-series features (trend, seasonality, forecasts) that can be used as features in ML models.

In [0]:
# Fit one Prophet model on the global daily average departure delay and compute
# its fitted components for the same dates.
print("Fitting Prophet model on global departure delays...")

# Prophet's input layout: a 'ds' timestamp column and a 'y' target column,
# with incomplete rows removed.
prophet_data = global_dep_delays[['ds', 'y']].dropna()

# Model settings, kept in one place so they are easy to compare with other fits.
global_prophet_settings = dict(
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,            # the input has one row per day, so there is no within-day cycle
    seasonality_mode='multiplicative',  # alternative: 'additive'
    interval_width=0.95,                # width of the yhat_lower / yhat_upper interval
    changepoint_prior_scale=0.05,       # flexibility of trend changes
)
prophet_model_global = Prophet(**global_prophet_settings)
prophet_model_global.fit(prophet_data)

# In-sample prediction: the model is asked for values on exactly the dates it
# was fitted on, which yields the trend, seasonal components and yhat per date.
# NOTE(review): these fitted values are computed from the same 'y' they describe —
# confirm this is acceptable for wherever the features are joined back.
forecast_global = prophet_model_global.predict(prophet_data[['ds']])

print(f"Prophet forecast generated for {len(forecast_global)} dates")
print("\nForecast columns:")
print(forecast_global.columns.tolist())
print("\nSample forecast:")
print(forecast_global[['ds', 'yhat', 'trend', 'yearly', 'weekly']].head(10))

In [0]:
# Select the Prophet output columns that will be joined back to the flight data
# as model features, and give them explicit "_global" names.
# Mapping: Prophet column -> feature name (insertion order = output column order).
global_feature_names = {
    'trend': 'prophet_trend_global',                      # long-term trend component
    'yearly': 'prophet_yearly_seasonality_global',        # yearly seasonality component
    'weekly': 'prophet_weekly_seasonality_global',        # weekly seasonality component
    'yhat': 'prophet_forecast_global',                    # fitted value combining trend and seasonality
    'yhat_lower': 'prophet_forecast_lower_global',        # lower bound of the forecast interval
    'yhat_upper': 'prophet_forecast_upper_global',        # upper bound of the forecast interval
}
# NOTE(review): the global model is configured with seasonality_mode='multiplicative',
# in which case the seasonal columns are relative to the trend rather than in minutes —
# confirm before interpreting them as additive effects.

prophet_features_global = (
    forecast_global[['ds', *global_feature_names]]
    .rename(columns=global_feature_names)
)

# String form of the date ('YYYY-MM-DD'), intended as the join key against FL_DATE
prophet_features_global['date_str'] = prophet_features_global['ds'].dt.strftime('%Y-%m-%d')

print("Global Prophet features (sample):")
print(prophet_features_global.head(10))
print(f"\nTotal feature rows: {len(prophet_features_global)}")
print(f"\nTotal feature rows: {len(prophet_features_global)}")


### Per-Airport Prophet Models

In [0]:
# Fit Prophet models for each airport
# This will take longer but provides airport-specific trend and seasonality features

MIN_DAYS_WEEKLY = 14      # daily rows required before weekly seasonality is fitted (two weeks)
MIN_DAYS_YEARLY = 365     # daily rows required before yearly seasonality is fitted
N_SAMPLE_AIRPORTS = 20    # number of airports fitted in this trial run

# Prophet output column -> per-origin feature name. Keys that are absent from a
# given forecast are simply ignored by DataFrame.rename.
PROPHET_ORIGIN_FEATURE_NAMES = {
    'trend': 'prophet_trend_origin',
    'yearly': 'prophet_yearly_seasonality_origin',
    'weekly': 'prophet_weekly_seasonality_origin',
    'yhat': 'prophet_forecast_origin',
    'yhat_lower': 'prophet_forecast_lower_origin',
    'yhat_upper': 'prophet_forecast_upper_origin',
}


def _airport_prophet_features(airport, airport_daily, min_days):
    """Fit Prophet on one airport's daily average departure delay.

    Parameters
    ----------
    airport : value written to the 'origin' column of the result.
    airport_daily : pandas DataFrame with 'ds' and 'avg_dep_delay' columns
        holding that airport's daily rows.
    min_days : minimum number of non-null daily rows required to fit a model.

    Returns
    -------
    pandas DataFrame with 'ds', the renamed Prophet components, 'origin' and
    'date_str', or None when fewer than `min_days` usable rows exist.

    Raises
    ------
    Whatever Prophet raises while fitting or predicting; the caller decides
    how to handle it.
    """
    history = (
        airport_daily
        .sort_values('ds')[['ds', 'avg_dep_delay']]
        .rename(columns={'avg_dep_delay': 'y'})
        .dropna()
    )
    n_days = len(history)
    if n_days < min_days:
        return None

    # Seasonal terms are enabled only when the history is long enough for them.
    model = Prophet(
        yearly_seasonality=n_days >= MIN_DAYS_YEARLY,
        weekly_seasonality=n_days >= MIN_DAYS_WEEKLY,
        daily_seasonality=False,
        seasonality_mode='multiplicative',
        interval_width=0.95,
        changepoint_prior_scale=0.05,
    )
    model.fit(history)

    # In-sample prediction on the airport's own dates
    forecast = model.predict(history[['ds']])

    # A seasonal column exists in the forecast only if that seasonality was
    # fitted, so both are selected by presence (order: trend, yearly, weekly).
    seasonal_cols = [name for name in ('yearly', 'weekly') if name in forecast.columns]
    features = forecast[['ds', 'trend', *seasonal_cols, 'yhat', 'yhat_lower', 'yhat_upper']].copy()
    features['origin'] = airport
    features = features.rename(columns=PROPHET_ORIGIN_FEATURE_NAMES)
    features['date_str'] = features['ds'].dt.strftime('%Y-%m-%d')
    return features


print("Fitting Prophet models for each airport...")
print(f"Total airports: {per_airport_ts['origin'].nunique()}")

# Check data availability per airport.
# Airports are ordered by number of daily rows, and ties (many airports have
# every day present) are broken by total flight volume so that the "top"
# airports below are a deterministic, meaningful selection.
airport_coverage = (
    per_airport_ts
    .groupby('origin')
    .agg(days=('ds', 'count'), flights=('flight_count', 'sum'))
    .sort_values(['days', 'flights'], ascending=False)
)
airport_day_counts = airport_coverage['days']
print(f"\nData availability statistics:")
print(f"  Min days: {airport_day_counts.min()}")
print(f"  Max days: {airport_day_counts.max()}")
print(f"  Mean days: {airport_day_counts.mean():.1f}")
print(f"  Median days: {airport_day_counts.median():.1f}")

# An airport needs at least two weeks of daily rows to be modelled at all.
# Airports with fewer than MIN_DAYS_YEARLY rows get trend and weekly components only.
min_days_required = MIN_DAYS_WEEKLY
airports_with_sufficient_data = airport_day_counts[airport_day_counts >= min_days_required].index.tolist()

print(f"\nAirports with >= {min_days_required} days of data: {len(airports_with_sufficient_data)}")

# Fit Prophet for a subset of airports first (for testing)
# In production, fit for all airports
top_airports = airports_with_sufficient_data[:N_SAMPLE_AIRPORTS]
print(f"\nFitting Prophet models for {len(top_airports)} airports (sample)...")

prophet_features_per_airport = []

for airport in top_airports:
    airport_rows = per_airport_ts[per_airport_ts['origin'] == airport]
    try:
        airport_features = _airport_prophet_features(airport, airport_rows, min_days_required)
    except Exception as e:
        # Best effort: one failing airport is reported and skipped so the
        # remaining airports are still fitted.
        print(f"Error fitting Prophet for airport {airport}: {str(e)}")
        continue
    if airport_features is not None:
        prophet_features_per_airport.append(airport_features)

# Combine all airport features
# (airports without yearly seasonality get NaN in the yearly column after concat)
if prophet_features_per_airport:
    prophet_features_airport_df = pd.concat(prophet_features_per_airport, ignore_index=True)
    print(f"\n✓ Generated Prophet features for {prophet_features_airport_df['origin'].nunique()} airports")
    print(f"Total feature rows: {len(prophet_features_airport_df)}")
    print("\nSample features:")
    print(prophet_features_airport_df.head(10))
else:
    print("\nNo Prophet features generated for airports")


## Conditional Expected Values for Flight Lineage Features

For Flight Lineage feature engineering, we need conditional expected values:
- **Per Carrier**: Expected delays, turn times, air times by carrier
- **Per Carrier-Airport**: Expected values conditional on carrier AND airport
- **Per Carrier-Airport-Time**: Expected values conditional on carrier, airport, AND time of day/day of week

These features will be used to compute:
- `expected_turn_time_carrier_airport`: Average time between arrival and departure for this carrier at this airport
- `expected_turn_time_carrier_airport_time`: Conditional on time of day
- `expected_air_time_route`: Average air time for origin-destination pair
- `expected_air_time_route_time_of_day`: Conditional on time of day
- And other conditional expected values needed for deterministic prediction formulas


In [None]:
# Prepare data for conditional expected values
# Need: carrier, origin, dest, date, time components, delays

print("Preparing data for conditional expected values...")

# Check available columns (case-insensitive search for anything carrier-like)
carrier_cols = [c for c in ts_data.columns if 'carrier' in c.lower()]
print(f"Carrier columns: {carrier_cols}")

# Use op_carrier if available, otherwise use the first carrier-like column.
# The op_carrier lookup is case-insensitive, matching the search above, so a
# column spelled e.g. OP_CARRIER is still preferred over an arbitrary first match.
carrier_col = next(
    (c for c in carrier_cols if c.lower() == 'op_carrier'),
    carrier_cols[0] if carrier_cols else None,
)

if carrier_col is None:
    print("WARNING: No carrier column found. Skipping carrier-based features.")
else:
    print(f"Using carrier column: {carrier_col}\n")

    # Scheduled departure time is read as an hhmm clock value
    # (530 -> 05:30, 1745 -> 17:45) and the hour is taken arithmetically.
    # Parsing the value as text with the 'HHmm' pattern needs exactly four
    # digits, so any departure stored as a number below 1000 (before 10:00)
    # ended up without an hour. The modulo maps a 2400 midnight value to hour 0.
    # NOTE(review): assumes crs_dep_time is numeric or a digit string in hhmm
    # form — confirm against the data schema.
    sched_dep_hhmm = col('crs_dep_time').cast('int')
    sched_dep_hour = (F.floor(sched_dep_hhmm / 100) % 24).cast('int')

    # Prepare time components
    ts_data_cond = (
        ts_data_prep
        .withColumn('hour', sched_dep_hour)
        # Spark's dayofweek is 1=Sunday ... 7=Saturday, which differs from the
        # pandas 0=Monday convention used in the EDA cells.
        .withColumn('day_of_week', F.dayofweek(col('date')))
        .withColumn('month', F.month(col('date')))
    )

    # Expose the chosen carrier column under the fixed name 'carrier'
    if carrier_col in ts_data_cond.columns:
        ts_data_cond = ts_data_cond.withColumn('carrier', col(carrier_col))

    print(f"Data prepared: {ts_data_cond.count():,} flights")
    print(f"Carriers: {ts_data_cond.select('carrier').distinct().count()}")
    print(f"Airports: {ts_data_cond.select('origin').distinct().count()}")


### Per-Carrier Expected Values


In [None]:
# Per-carrier expected values: averages and standard deviations by carrier,
# pooled over all airports and times.

def _carrier_mean_std(flights, value_col, stem, extra_aggs=()):
    """Mean and standard deviation of `value_col` per carrier.

    Rows where `value_col` is null are excluded first. The result columns are
    named 'expected_<stem>_carrier' and 'std_<stem>_carrier'; any expressions
    in `extra_aggs` are appended after them.
    """
    return (
        flights
        .where(col(value_col).isNotNull())
        .groupBy('carrier')
        .agg(
            F.avg(value_col).alias(f'expected_{stem}_carrier'),
            F.stddev(value_col).alias(f'std_{stem}_carrier'),
            *extra_aggs,
        )
    )


if 'ts_data_cond' in locals() and 'carrier' in ts_data_cond.columns:
    print("Computing per-carrier expected values...")

    # Departure delay (also carries the number of flights behind each average)
    expected_dep_delay_carrier = _carrier_mean_std(
        ts_data_cond, 'DEP_DELAY', 'dep_delay',
        extra_aggs=[F.count('*').alias('flight_count_carrier')],
    )

    # Arrival delay
    expected_arr_delay_carrier = _carrier_mean_std(ts_data_cond, 'ARR_DELAY', 'arr_delay')

    # Air time, only when the column exists in the data
    expected_air_time_carrier = (
        _carrier_mean_std(ts_data_cond, 'air_time', 'air_time')
        if 'air_time' in ts_data_cond.columns
        else None
    )

    # Outer joins keep a carrier even if one of the measures is missing for it.
    carrier_features = expected_dep_delay_carrier.join(
        expected_arr_delay_carrier, on='carrier', how='outer'
    )
    if expected_air_time_carrier is not None:
        carrier_features = carrier_features.join(
            expected_air_time_carrier, on='carrier', how='outer'
        )

    print(f"Per-carrier features computed for {carrier_features.count()} carriers")
    print("\nSample per-carrier features:")
    display(carrier_features.limit(10))

    # Persist the table, replacing any previous version at this path
    carrier_features_path = f"{FOLDER_PATH}/expected_values_carrier.parquet"
    carrier_features.write.mode("overwrite").parquet(carrier_features_path)
    print(f"\nSaved to: {carrier_features_path}")
else:
    print("Carrier data not available. Run previous cell first.")


### Per-Carrier-Airport Expected Values


In [None]:
# Per-carrier-airport expected values: the same statistics as the per-carrier
# table, but conditional on carrier AND airport.

if 'ts_data_cond' in locals() and 'carrier' in ts_data_cond.columns:
    print("Computing per-carrier-airport expected values...")
    print("This may take a while due to the large number of combinations...\n")

    carrier_origin_keys = ['carrier', 'origin']

    # Departure delay by (carrier, origin), with the flight count behind it
    expected_dep_delay_carrier_airport = (
        ts_data_cond
        .where(col('DEP_DELAY').isNotNull())
        .groupBy(*carrier_origin_keys)
        .agg(
            F.avg('DEP_DELAY').alias('expected_dep_delay_carrier_airport'),
            F.stddev('DEP_DELAY').alias('std_dep_delay_carrier_airport'),
            F.count('*').alias('flight_count_carrier_airport'),
        )
    )

    # Arrival delay by (carrier, origin)
    expected_arr_delay_carrier_airport = (
        ts_data_cond
        .where(col('ARR_DELAY').isNotNull())
        .groupBy(*carrier_origin_keys)
        .agg(
            F.avg('ARR_DELAY').alias('expected_arr_delay_carrier_airport'),
            F.stddev('ARR_DELAY').alias('std_arr_delay_carrier_airport'),
        )
    )

    # Turn time (arrival at an airport to the next departure from it) would
    # need flight sequences. Until that exists, taxi-out and taxi-in times are
    # used as stand-ins for its components, each only if the column is present.
    expected_taxi_out_carrier_airport = (
        ts_data_cond
        .where(col('taxi_out').isNotNull())
        .groupBy(*carrier_origin_keys)
        .agg(
            F.avg('taxi_out').alias('expected_taxi_out_carrier_airport'),
            F.stddev('taxi_out').alias('std_taxi_out_carrier_airport'),
        )
        if 'taxi_out' in ts_data_cond.columns
        else None
    )

    # Taxi-in is grouped by the destination airport, where taxiing in takes place.
    expected_taxi_in_carrier_airport = (
        ts_data_cond
        .where(col('taxi_in').isNotNull())
        .groupBy('carrier', 'dest')
        .agg(
            F.avg('taxi_in').alias('expected_taxi_in_carrier_airport'),
            F.stddev('taxi_in').alias('std_taxi_in_carrier_airport'),
        )
        if 'taxi_in' in ts_data_cond.columns
        else None
    )

    # Start from the delay statistics, then attach whichever taxi tables exist.
    carrier_airport_features = expected_dep_delay_carrier_airport.join(
        expected_arr_delay_carrier_airport, on=carrier_origin_keys, how='outer'
    )

    optional_tables = [expected_taxi_out_carrier_airport]
    if expected_taxi_in_carrier_airport is not None:
        # The airport key of the taxi-in table is 'dest'; rename it to 'origin'
        # so it lines up with the airport key of the other tables.
        taxi_in_renamed = expected_taxi_in_carrier_airport.withColumnRenamed('dest', 'origin')
        optional_tables.append(taxi_in_renamed)

    for optional_table in optional_tables:
        if optional_table is not None:
            carrier_airport_features = carrier_airport_features.join(
                optional_table, on=carrier_origin_keys, how='outer'
            )

    print(f"Per-carrier-airport features computed for {carrier_airport_features.count()} combinations")
    print("\nSample per-carrier-airport features:")
    display(carrier_airport_features.limit(10))

    # Persist the table, replacing any previous version at this path
    carrier_airport_features_path = f"{FOLDER_PATH}/expected_values_carrier_airport.parquet"
    carrier_airport_features.write.mode("overwrite").parquet(carrier_airport_features_path)
    print(f"\nSaved to: {carrier_airport_features_path}")
else:
    print("Carrier data not available. Run previous cell first.")


In [None]:
# Compute per-carrier-airport-time expected values
# Conditional on carrier, airport, AND the scheduled-departure hour bucket

if 'ts_data_cond' in locals() and 'carrier' in ts_data_cond.columns:
    print("Computing per-carrier-airport-time expected values...")
    print("This will create many combinations - using hour and day_of_week for now...\n")

    # Create time buckets to reduce cardinality
    # Hour buckets: 0-5 (early morning), 6-11 (morning), 12-17 (afternoon), 18-23 (evening)
    # Every bucket is matched explicitly. A null or out-of-range hour fails all
    # four conditions and is labelled 'unknown' instead of being counted as
    # evening, which would distort the evening statistics. A string label
    # (rather than null) keeps the join keys below comparable.
    hour_col = col('hour')
    ts_data_cond_time = ts_data_cond.withColumn(
        'hour_bucket',
        when(hour_col.between(0, 5), 'early_morning')
        .when(hour_col.between(6, 11), 'morning')
        .when(hour_col.between(12, 17), 'afternoon')
        .when(hour_col.between(18, 23), 'evening')
        .otherwise('unknown')
    )

    time_keys = ['carrier', 'origin', 'hour_bucket']

    # Expected departure delay by carrier-airport-hour_bucket
    expected_dep_delay_carrier_airport_time = (
        ts_data_cond_time
        .filter(col('DEP_DELAY').isNotNull())
        .groupBy(*time_keys)
        .agg(
            F.avg('DEP_DELAY').alias('expected_dep_delay_carrier_airport_time'),
            F.stddev('DEP_DELAY').alias('std_dep_delay_carrier_airport_time'),
            F.count('*').alias('flight_count_carrier_airport_time')
        )
    )

    # Expected arrival delay by carrier-airport-hour_bucket
    expected_arr_delay_carrier_airport_time = (
        ts_data_cond_time
        .filter(col('ARR_DELAY').isNotNull())
        .groupBy(*time_keys)
        .agg(
            F.avg('ARR_DELAY').alias('expected_arr_delay_carrier_airport_time'),
            F.stddev('ARR_DELAY').alias('std_arr_delay_carrier_airport_time')
        )
    )

    # Expected taxi-out time by carrier-airport-hour_bucket (only if the column exists)
    expected_taxi_out_carrier_airport_time = None
    if 'taxi_out' in ts_data_cond_time.columns:
        expected_taxi_out_carrier_airport_time = (
            ts_data_cond_time
            .filter(col('taxi_out').isNotNull())
            .groupBy(*time_keys)
            .agg(
                F.avg('taxi_out').alias('expected_taxi_out_carrier_airport_time'),
                F.stddev('taxi_out').alias('std_taxi_out_carrier_airport_time')
            )
        )

    # Combine carrier-airport-time features; outer joins keep a combination
    # even when one of the measures is missing for it.
    carrier_airport_time_features = expected_dep_delay_carrier_airport_time.join(
        expected_arr_delay_carrier_airport_time,
        time_keys,
        'outer'
    )

    if expected_taxi_out_carrier_airport_time is not None:
        carrier_airport_time_features = carrier_airport_time_features.join(
            expected_taxi_out_carrier_airport_time,
            time_keys,
            'outer'
        )

    print(f"Per-carrier-airport-time features computed for {carrier_airport_time_features.count()} combinations")
    print("\nSample per-carrier-airport-time features:")
    display(carrier_airport_time_features.limit(10))

    # Save (replaces any previous version at this path)
    carrier_airport_time_features_path = f"{FOLDER_PATH}/expected_values_carrier_airport_time.parquet"
    carrier_airport_time_features.write.mode("overwrite").parquet(carrier_airport_time_features_path)
    print(f"\nSaved to: {carrier_airport_time_features_path}")
else:
    print("Carrier data not available. Run previous cell first.")


### Per-Route Expected Values

For Flight Lineage features, we also need expected air times by route (origin-destination pair)


In [None]:
# Per-route expected air time
# A route is identified by its (origin, dest) airport pair

if 'ts_data_cond' not in locals():
    print("Data not available. Run previous cells first.")
else:
    print("Computing per-route expected values...")
    
    # Remains None when the air_time column is missing
    expected_air_time_route = None
    if 'air_time' not in ts_data_cond.columns:
        print("air_time column not available")
    else:
        # Only rows with a recorded air time contribute to the statistics
        air_time_rows = ts_data_cond.filter(col('air_time').isNotNull())
        route_air_time_stats = [
            F.avg('air_time').alias('expected_air_time_route'),
            F.stddev('air_time').alias('std_air_time_route'),
            F.count('*').alias('flight_count_route'),
        ]
        expected_air_time_route = (
            air_time_rows
            .groupBy('origin', 'dest')
            .agg(*route_air_time_stats)
        )
        
        route_total = expected_air_time_route.count()
        print(f"Per-route air time features computed for {route_total} routes")
        print("\nSample per-route features:")
        display(expected_air_time_route.limit(10))
        
        # Persist next to the other expected-value tables
        route_features_path = f"{FOLDER_PATH}/expected_values_route.parquet"
        expected_air_time_route.write.mode("overwrite").parquet(route_features_path)
        print(f"\nSaved to: {route_features_path}")


### Per-Route-Time Expected Values

Conditional on route AND time components (for time-of-day and seasonal effects on air time)


## Prophet-Based Conditional Expected Values

Use Prophet models to generate time-series based conditional expected values with trends and seasonality.
These features capture temporal patterns better than simple averages and can be used for Flight Lineage feature engineering.

### Approach:
1. Aggregate time series by grouping (carrier, carrier-airport, carrier-airport-time, route, route-time)
2. Fit Prophet models for each group (if sufficient data)
3. Extract Prophet features (trend, seasonality, forecast) for each date
4. These features provide conditional expected values that account for temporal trends


### Per-Carrier Prophet Models

Generate time-series based expected delays per carrier using Prophet


In [None]:
# Generate per-carrier time series and fit Prophet models
# This provides conditional expected delays by carrier that account for temporal trends
#
# NOTE(review): predict() is evaluated on the same dates the model was fitted on, so the
# stored values are in-sample fits rather than out-of-sample forecasts - confirm that is
# what the downstream features expect.
# NOTE(review): seasonality_mode='multiplicative' scales seasonality by the trend; if daily
# mean delays can sit near zero or go negative, compare against 'additive'.

if 'ts_data_cond' in locals() and 'carrier' in ts_data_cond.columns:
    print("Generating per-carrier time series for Prophet models...")
    
    # Aggregate departure delays by carrier and date
    per_carrier_ts_spark = (
        ts_data_cond
        .filter(col('DEP_DELAY').isNotNull())
        .groupBy('carrier', 'date')
        .agg(
            F.avg('DEP_DELAY').alias('avg_dep_delay'),
            F.count('*').alias('flight_count')
        )
        .orderBy('carrier', 'date')
    )
    
    # Convert to pandas for Prophet
    per_carrier_ts = per_carrier_ts_spark.toPandas()
    per_carrier_ts['ds'] = pd.to_datetime(per_carrier_ts['date'])
    
    print(f"Per-carrier time series: {len(per_carrier_ts):,} rows")
    print(f"Number of carriers: {per_carrier_ts['carrier'].nunique()}")
    
    # Check data availability per carrier (number of daily observations, most first)
    carrier_day_counts = per_carrier_ts.groupby('carrier')['ds'].count().sort_values(ascending=False)
    print(f"\nData availability statistics:")
    print(f"  Min days: {carrier_day_counts.min()}")
    print(f"  Max days: {carrier_day_counts.max()}")
    print(f"  Mean days: {carrier_day_counts.mean():.1f}")
    print(f"  Median days: {carrier_day_counts.median():.1f}")
    
    # Need at least 14 days for weekly seasonality, 365 for yearly
    min_days_required = 14
    carriers_with_sufficient_data = carrier_day_counts[carrier_day_counts >= min_days_required].index.tolist()
    print(f"\nCarriers with >= {min_days_required} days of data: {len(carriers_with_sufficient_data)}")
    
    # Fit Prophet models for each carrier
    print(f"\nFitting Prophet models for {len(carriers_with_sufficient_data)} carriers...")
    
    # Prophet simulates its uncertainty intervals with NumPy's global RNG; re-seeding
    # before every predict() keeps yhat_lower / yhat_upper identical across runs.
    prophet_seed = 42
    
    # Output name for every Prophet component that may appear in a forecast
    rename_dict = {
        'trend': 'prophet_trend_carrier',
        'yearly': 'prophet_yearly_seasonality_carrier',
        'weekly': 'prophet_weekly_seasonality_carrier',
        'yhat': 'prophet_forecast_dep_delay_carrier',
        'yhat_lower': 'prophet_forecast_lower_carrier',
        'yhat_upper': 'prophet_forecast_upper_carrier'
    }
    
    # Split the frame once instead of re-filtering all rows for every carrier
    per_carrier_groups = per_carrier_ts.groupby('carrier')
    
    prophet_features_per_carrier = []
    
    for carrier in carriers_with_sufficient_data:
        carrier_data = per_carrier_groups.get_group(carrier).sort_values('ds')
        carrier_prophet_data = carrier_data[['ds', 'avg_dep_delay']].copy()
        carrier_prophet_data = carrier_prophet_data.rename(columns={'avg_dep_delay': 'y'})
        carrier_prophet_data = carrier_prophet_data.dropna()
        
        # Re-check after dropping missing values
        if len(carrier_prophet_data) < min_days_required:
            continue
        
        try:
            # Determine seasonality based on data availability
            has_enough_for_yearly = len(carrier_prophet_data) >= 365
            has_enough_for_weekly = len(carrier_prophet_data) >= 14
            
            # Fit Prophet model
            prophet_model = Prophet(
                yearly_seasonality=has_enough_for_yearly,
                weekly_seasonality=has_enough_for_weekly,
                daily_seasonality=False,
                seasonality_mode='multiplicative',
                interval_width=0.95,
                changepoint_prior_scale=0.05
            )
            prophet_model.fit(carrier_prophet_data)
            
            # Generate the in-sample forecast with a fixed seed
            np.random.seed(prophet_seed)
            forecast = prophet_model.predict(carrier_prophet_data[['ds']])
            
            # A seasonal component is only present when that seasonality was enabled
            seasonal_cols = [name for name in ('yearly', 'weekly') if name in forecast.columns]
            feature_cols = ['ds', 'trend', *seasonal_cols, 'yhat', 'yhat_lower', 'yhat_upper']
            
            features = forecast[feature_cols].copy()
            features['carrier'] = carrier
            
            # rename() ignores components that are absent from this forecast
            features = features.rename(columns=rename_dict)
            features['date_str'] = features['ds'].dt.strftime('%Y-%m-%d')
            
            prophet_features_per_carrier.append(features)
            
        except Exception as e:
            # Best effort: one failing carrier must not abort the remaining fits
            print(f"Error fitting Prophet for carrier {carrier}: {str(e)}")
            continue
    
    # Combine all carrier features
    if prophet_features_per_carrier:
        prophet_features_carrier_df = pd.concat(prophet_features_per_carrier, ignore_index=True)
        print(f"\n✓ Generated Prophet features for {prophet_features_carrier_df['carrier'].nunique()} carriers")
        print(f"Total feature rows: {len(prophet_features_carrier_df)}")
        print("\nSample features:")
        print(prophet_features_carrier_df.head(10))
        
        # Save to parquet (convert back to Spark for saving)
        prophet_features_carrier_spark = spark.createDataFrame(prophet_features_carrier_df)
        carrier_prophet_features_path = f"{FOLDER_PATH}/prophet_features_carrier.parquet"
        prophet_features_carrier_spark.write.mode("overwrite").parquet(carrier_prophet_features_path)
        print(f"\nSaved to: {carrier_prophet_features_path}")
    else:
        print("\nNo Prophet features generated for carriers")
else:
    print("Carrier data not available. Run previous cells first.")


### Per-Carrier-Airport Prophet Models

Generate time-series based expected delays per carrier-airport combination using Prophet.
This provides conditional expected values that account for both carrier and airport-specific temporal patterns.


# Generate per-carrier-airport time series and fit Prophet models
# This will take longer due to many combinations, so we'll filter to those with sufficient data
#
# NOTE(review): predict() is evaluated on the same dates the model was fitted on, so the
# stored values are in-sample fits rather than out-of-sample forecasts - confirm that is
# what the downstream features expect.
# NOTE(review): seasonality_mode='multiplicative' scales seasonality by the trend; if daily
# mean delays can sit near zero or go negative, compare against 'additive'.

if 'ts_data_cond' in locals() and 'carrier' in ts_data_cond.columns:
    print("Generating per-carrier-airport time series for Prophet models...")
    print("This may take a while due to the large number of combinations...\n")
    
    # Aggregate departure delays by carrier, airport, and date
    per_carrier_airport_ts_spark = (
        ts_data_cond
        .filter(col('DEP_DELAY').isNotNull())
        .groupBy('carrier', 'origin', 'date')
        .agg(
            F.avg('DEP_DELAY').alias('avg_dep_delay'),
            F.count('*').alias('flight_count')
        )
        .orderBy('carrier', 'origin', 'date')
    )
    
    # Convert to pandas for Prophet
    per_carrier_airport_ts = per_carrier_airport_ts_spark.toPandas()
    per_carrier_airport_ts['ds'] = pd.to_datetime(per_carrier_airport_ts['date'])
    
    # Split the frame once; reused for the counts and for the per-group lookups below
    per_carrier_airport_groups = per_carrier_airport_ts.groupby(['carrier', 'origin'])
    
    print(f"Per-carrier-airport time series: {len(per_carrier_airport_ts):,} rows")
    print(f"Number of carrier-airport combinations: {per_carrier_airport_groups.ngroups}")
    
    # Check data availability per carrier-airport (number of daily observations, most first)
    carrier_airport_day_counts = (
        per_carrier_airport_groups['ds']
        .count()
        .sort_values(ascending=False)
    )
    
    print(f"\nData availability statistics:")
    print(f"  Min days: {carrier_airport_day_counts.min()}")
    print(f"  Max days: {carrier_airport_day_counts.max()}")
    print(f"  Mean days: {carrier_airport_day_counts.mean():.1f}")
    print(f"  Median days: {carrier_airport_day_counts.median():.1f}")
    
    # Need at least 14 days for weekly seasonality
    min_days_required = 14
    carrier_airports_with_sufficient_data = (
        carrier_airport_day_counts[carrier_airport_day_counts >= min_days_required]
        .index
        .tolist()
    )
    
    print(f"\nCarrier-airport combinations with >= {min_days_required} days: {len(carrier_airports_with_sufficient_data)}")
    
    # Limit to top N combinations for initial testing (can remove limit for full run)
    # For production, fit models for all combinations with sufficient data
    max_combinations = 100  # Adjust based on compute resources
    if len(carrier_airports_with_sufficient_data) > max_combinations:
        print(f"Limiting to top {max_combinations} combinations by data availability...")
        # The list is already ordered by day count (descending), so a slice keeps the
        # best-covered combinations and can never pick one below the threshold
        top_combinations = carrier_airports_with_sufficient_data[:max_combinations]
    else:
        top_combinations = carrier_airports_with_sufficient_data
    
    print(f"Fitting Prophet models for {len(top_combinations)} carrier-airport combinations...")
    
    # Prophet simulates its uncertainty intervals with NumPy's global RNG; re-seeding
    # before every predict() keeps yhat_lower / yhat_upper identical across runs.
    prophet_seed = 42
    
    # Output name for every Prophet component that may appear in a forecast
    rename_dict = {
        'trend': 'prophet_trend_carrier_airport',
        'yearly': 'prophet_yearly_seasonality_carrier_airport',
        'weekly': 'prophet_weekly_seasonality_carrier_airport',
        'yhat': 'prophet_forecast_dep_delay_carrier_airport',
        'yhat_lower': 'prophet_forecast_lower_carrier_airport',
        'yhat_upper': 'prophet_forecast_upper_carrier_airport'
    }
    
    prophet_features_per_carrier_airport = []
    
    for idx, (carrier, origin) in enumerate(top_combinations):
        if (idx + 1) % 10 == 0:
            print(f"  Processing {idx + 1}/{len(top_combinations)}: {carrier}-{origin}")
        
        carrier_airport_data = (
            per_carrier_airport_groups
            .get_group((carrier, origin))
            .sort_values('ds')
        )
        
        carrier_airport_prophet_data = carrier_airport_data[['ds', 'avg_dep_delay']].copy()
        carrier_airport_prophet_data = carrier_airport_prophet_data.rename(columns={'avg_dep_delay': 'y'})
        carrier_airport_prophet_data = carrier_airport_prophet_data.dropna()
        
        # Re-check after dropping missing values
        if len(carrier_airport_prophet_data) < min_days_required:
            continue
        
        try:
            # Determine seasonality based on data availability
            has_enough_for_yearly = len(carrier_airport_prophet_data) >= 365
            has_enough_for_weekly = len(carrier_airport_prophet_data) >= 14
            
            # Fit Prophet model
            prophet_model = Prophet(
                yearly_seasonality=has_enough_for_yearly,
                weekly_seasonality=has_enough_for_weekly,
                daily_seasonality=False,
                seasonality_mode='multiplicative',
                interval_width=0.95,
                changepoint_prior_scale=0.05
            )
            prophet_model.fit(carrier_airport_prophet_data)
            
            # Generate the in-sample forecast with a fixed seed
            np.random.seed(prophet_seed)
            forecast = prophet_model.predict(carrier_airport_prophet_data[['ds']])
            
            # A seasonal component is only present when that seasonality was enabled
            seasonal_cols = [name for name in ('yearly', 'weekly') if name in forecast.columns]
            feature_cols = ['ds', 'trend', *seasonal_cols, 'yhat', 'yhat_lower', 'yhat_upper']
            
            features = forecast[feature_cols].copy()
            features['carrier'] = carrier
            features['origin'] = origin
            
            # rename() ignores components that are absent from this forecast
            features = features.rename(columns=rename_dict)
            features['date_str'] = features['ds'].dt.strftime('%Y-%m-%d')
            
            prophet_features_per_carrier_airport.append(features)
            
        except Exception as e:
            # Best effort: one failing combination must not abort the remaining fits
            print(f"  Error fitting Prophet for {carrier}-{origin}: {str(e)}")
            continue
    
    # Combine all carrier-airport features
    if prophet_features_per_carrier_airport:
        prophet_features_carrier_airport_df = pd.concat(prophet_features_per_carrier_airport, ignore_index=True)
        print(f"\n✓ Generated Prophet features for {prophet_features_carrier_airport_df.groupby(['carrier', 'origin']).ngroups} carrier-airport combinations")
        print(f"Total feature rows: {len(prophet_features_carrier_airport_df)}")
        print("\nSample features:")
        print(prophet_features_carrier_airport_df.head(10))
        
        # Save to parquet
        prophet_features_carrier_airport_spark = spark.createDataFrame(prophet_features_carrier_airport_df)
        carrier_airport_prophet_features_path = f"{FOLDER_PATH}/prophet_features_carrier_airport.parquet"
        prophet_features_carrier_airport_spark.write.mode("overwrite").parquet(carrier_airport_prophet_features_path)
        print(f"\nSaved to: {carrier_airport_prophet_features_path}")
    else:
        print("\nNo Prophet features generated for carrier-airport combinations")
else:
    print("Carrier data not available. Run previous cells first.")


### Per-Carrier-Airport-Time Prophet Models

Generate time-series based expected delays per carrier-airport-time combination using Prophet.
This provides conditional expected values that account for carrier, airport, AND time-of-day patterns.
Note: We use time buckets (early_morning, morning, afternoon, evening) to reduce cardinality while still capturing time-of-day effects.


In [None]:
# Generate per-carrier-airport-time time series and fit Prophet models
# Using time buckets to reduce cardinality while capturing time-of-day effects
#
# NOTE(review): predict() is evaluated on the same dates the model was fitted on, so the
# stored values are in-sample fits rather than out-of-sample forecasts - confirm that is
# what the downstream features expect.
# NOTE(review): seasonality_mode='multiplicative' scales seasonality by the trend; if daily
# mean delays can sit near zero or go negative, compare against 'additive'.

if 'ts_data_cond_time' in locals() and 'carrier' in ts_data_cond_time.columns:
    print("Generating per-carrier-airport-time time series for Prophet models...")
    print("Using time buckets (early_morning, morning, afternoon, evening)...\n")
    
    # Aggregate departure delays by carrier, airport, time bucket, and date
    per_carrier_airport_time_ts_spark = (
        ts_data_cond_time
        .filter(col('DEP_DELAY').isNotNull())
        .groupBy('carrier', 'origin', 'hour_bucket', 'date')
        .agg(
            F.avg('DEP_DELAY').alias('avg_dep_delay'),
            F.count('*').alias('flight_count')
        )
        .orderBy('carrier', 'origin', 'hour_bucket', 'date')
    )
    
    # Convert to pandas for Prophet
    per_carrier_airport_time_ts = per_carrier_airport_time_ts_spark.toPandas()
    per_carrier_airport_time_ts['ds'] = pd.to_datetime(per_carrier_airport_time_ts['date'])
    
    # Split the frame once; reused for the counts and for the per-group lookups below
    per_carrier_airport_time_groups = per_carrier_airport_time_ts.groupby(['carrier', 'origin', 'hour_bucket'])
    
    print(f"Per-carrier-airport-time time series: {len(per_carrier_airport_time_ts):,} rows")
    print(f"Number of carrier-airport-time combinations: {per_carrier_airport_time_groups.ngroups}")
    
    # Check data availability per carrier-airport-time (number of daily observations, most first)
    carrier_airport_time_day_counts = (
        per_carrier_airport_time_groups['ds']
        .count()
        .sort_values(ascending=False)
    )
    
    print(f"\nData availability statistics:")
    print(f"  Min days: {carrier_airport_time_day_counts.min()}")
    print(f"  Max days: {carrier_airport_time_day_counts.max()}")
    print(f"  Mean days: {carrier_airport_time_day_counts.mean():.1f}")
    print(f"  Median days: {carrier_airport_time_day_counts.median():.1f}")
    
    # Need at least 14 days for weekly seasonality
    min_days_required = 14
    carrier_airport_times_with_sufficient_data = (
        carrier_airport_time_day_counts[carrier_airport_time_day_counts >= min_days_required]
        .index
        .tolist()
    )
    
    print(f"\nCarrier-airport-time combinations with >= {min_days_required} days: {len(carrier_airport_times_with_sufficient_data)}")
    
    # Limit to top N combinations for initial testing
    max_combinations = 200  # Can adjust based on compute resources
    if len(carrier_airport_times_with_sufficient_data) > max_combinations:
        print(f"Limiting to top {max_combinations} combinations by data availability...")
        # The list is already ordered by day count (descending), so a slice keeps the
        # best-covered combinations and can never pick one below the threshold
        top_combinations = carrier_airport_times_with_sufficient_data[:max_combinations]
    else:
        top_combinations = carrier_airport_times_with_sufficient_data
    
    print(f"Fitting Prophet models for {len(top_combinations)} carrier-airport-time combinations...")
    
    # Prophet simulates its uncertainty intervals with NumPy's global RNG; re-seeding
    # before every predict() keeps yhat_lower / yhat_upper identical across runs.
    prophet_seed = 42
    
    # Output name for every Prophet component that may appear in a forecast
    rename_dict = {
        'trend': 'prophet_trend_carrier_airport_time',
        'yearly': 'prophet_yearly_seasonality_carrier_airport_time',
        'weekly': 'prophet_weekly_seasonality_carrier_airport_time',
        'yhat': 'prophet_forecast_dep_delay_carrier_airport_time',
        'yhat_lower': 'prophet_forecast_lower_carrier_airport_time',
        'yhat_upper': 'prophet_forecast_upper_carrier_airport_time'
    }
    
    prophet_features_per_carrier_airport_time = []
    
    for idx, (carrier, origin, hour_bucket) in enumerate(top_combinations):
        if (idx + 1) % 20 == 0:
            print(f"  Processing {idx + 1}/{len(top_combinations)}: {carrier}-{origin}-{hour_bucket}")
        
        carrier_airport_time_data = (
            per_carrier_airport_time_groups
            .get_group((carrier, origin, hour_bucket))
            .sort_values('ds')
        )
        
        carrier_airport_time_prophet_data = carrier_airport_time_data[['ds', 'avg_dep_delay']].copy()
        carrier_airport_time_prophet_data = carrier_airport_time_prophet_data.rename(columns={'avg_dep_delay': 'y'})
        carrier_airport_time_prophet_data = carrier_airport_time_prophet_data.dropna()
        
        # Re-check after dropping missing values
        if len(carrier_airport_time_prophet_data) < min_days_required:
            continue
        
        try:
            # Determine seasonality based on data availability
            has_enough_for_yearly = len(carrier_airport_time_prophet_data) >= 365
            has_enough_for_weekly = len(carrier_airport_time_prophet_data) >= 14
            
            # Fit Prophet model
            prophet_model = Prophet(
                yearly_seasonality=has_enough_for_yearly,
                weekly_seasonality=has_enough_for_weekly,
                daily_seasonality=False,
                seasonality_mode='multiplicative',
                interval_width=0.95,
                changepoint_prior_scale=0.05
            )
            prophet_model.fit(carrier_airport_time_prophet_data)
            
            # Generate the in-sample forecast with a fixed seed
            np.random.seed(prophet_seed)
            forecast = prophet_model.predict(carrier_airport_time_prophet_data[['ds']])
            
            # A seasonal component is only present when that seasonality was enabled
            seasonal_cols = [name for name in ('yearly', 'weekly') if name in forecast.columns]
            feature_cols = ['ds', 'trend', *seasonal_cols, 'yhat', 'yhat_lower', 'yhat_upper']
            
            features = forecast[feature_cols].copy()
            features['carrier'] = carrier
            features['origin'] = origin
            features['hour_bucket'] = hour_bucket
            
            # rename() ignores components that are absent from this forecast
            features = features.rename(columns=rename_dict)
            features['date_str'] = features['ds'].dt.strftime('%Y-%m-%d')
            
            prophet_features_per_carrier_airport_time.append(features)
            
        except Exception as e:
            # Best effort: one failing combination must not abort the remaining fits
            print(f"  Error fitting Prophet for {carrier}-{origin}-{hour_bucket}: {str(e)}")
            continue
    
    # Combine all carrier-airport-time features
    if prophet_features_per_carrier_airport_time:
        prophet_features_carrier_airport_time_df = pd.concat(prophet_features_per_carrier_airport_time, ignore_index=True)
        print(f"\n✓ Generated Prophet features for {prophet_features_carrier_airport_time_df.groupby(['carrier', 'origin', 'hour_bucket']).ngroups} carrier-airport-time combinations")
        print(f"Total feature rows: {len(prophet_features_carrier_airport_time_df)}")
        print("\nSample features:")
        print(prophet_features_carrier_airport_time_df.head(10))
        
        # Save to parquet
        prophet_features_carrier_airport_time_spark = spark.createDataFrame(prophet_features_carrier_airport_time_df)
        carrier_airport_time_prophet_features_path = f"{FOLDER_PATH}/prophet_features_carrier_airport_time.parquet"
        prophet_features_carrier_airport_time_spark.write.mode("overwrite").parquet(carrier_airport_time_prophet_features_path)
        print(f"\nSaved to: {carrier_airport_time_prophet_features_path}")
    else:
        print("\nNo Prophet features generated for carrier-airport-time combinations")
else:
    print("Carrier-time data not available. Run previous cells first (including time bucket creation).")


### Per-Route Prophet Models (Air Time)

Generate time-series based expected air times per route (origin-destination pair) using Prophet.
This provides conditional expected air times that account for temporal trends and seasonality.
Important for Flight Lineage features to predict expected flight duration.


In [None]:
# Generate per-route time series for air time and fit Prophet models
# This provides conditional expected air times by route that account for temporal trends
#
# NOTE(review): predict() is evaluated on the same dates the model was fitted on, so the
# stored values are in-sample fits rather than out-of-sample forecasts - confirm that is
# what the downstream features expect.

if 'ts_data_cond' in locals() and 'air_time' in ts_data_cond.columns:
    print("Generating per-route time series for air time Prophet models...")
    
    # Aggregate air time by route (origin-dest) and date
    per_route_ts_spark = (
        ts_data_cond
        .filter(col('air_time').isNotNull())
        .groupBy('origin', 'dest', 'date')
        .agg(
            F.avg('air_time').alias('avg_air_time'),
            F.count('*').alias('flight_count')
        )
        .orderBy('origin', 'dest', 'date')
    )
    
    # Convert to pandas for Prophet
    per_route_ts = per_route_ts_spark.toPandas()
    per_route_ts['ds'] = pd.to_datetime(per_route_ts['date'])
    
    # Split the frame once; reused for the counts and for the per-route lookups below
    per_route_groups = per_route_ts.groupby(['origin', 'dest'])
    
    print(f"Per-route time series: {len(per_route_ts):,} rows")
    print(f"Number of routes: {per_route_groups.ngroups}")
    
    # Check data availability per route (number of daily observations, most first)
    route_day_counts = (
        per_route_groups['ds']
        .count()
        .sort_values(ascending=False)
    )
    
    print(f"\nData availability statistics:")
    print(f"  Min days: {route_day_counts.min()}")
    print(f"  Max days: {route_day_counts.max()}")
    print(f"  Mean days: {route_day_counts.mean():.1f}")
    print(f"  Median days: {route_day_counts.median():.1f}")
    
    # Need at least 14 days for weekly seasonality
    min_days_required = 14
    routes_with_sufficient_data = (
        route_day_counts[route_day_counts >= min_days_required]
        .index
        .tolist()
    )
    
    print(f"\nRoutes with >= {min_days_required} days: {len(routes_with_sufficient_data)}")
    
    # Limit to top N routes for initial testing
    max_routes = 150  # Can adjust based on compute resources
    if len(routes_with_sufficient_data) > max_routes:
        print(f"Limiting to top {max_routes} routes by data availability...")
        # The list is already ordered by day count (descending), so a slice keeps the
        # best-covered routes and can never pick one below the threshold
        top_routes = routes_with_sufficient_data[:max_routes]
    else:
        top_routes = routes_with_sufficient_data
    
    print(f"Fitting Prophet models for {len(top_routes)} routes...")
    
    # Prophet simulates its uncertainty intervals with NumPy's global RNG; re-seeding
    # before every predict() keeps yhat_lower / yhat_upper identical across runs.
    prophet_seed = 42
    
    # Output name for every Prophet component that may appear in a forecast
    rename_dict = {
        'trend': 'prophet_trend_route',
        'yearly': 'prophet_yearly_seasonality_route',
        'weekly': 'prophet_weekly_seasonality_route',
        'yhat': 'prophet_forecast_air_time_route',
        'yhat_lower': 'prophet_forecast_lower_route',
        'yhat_upper': 'prophet_forecast_upper_route'
    }
    
    prophet_features_per_route = []
    
    for idx, (origin, dest) in enumerate(top_routes):
        if (idx + 1) % 20 == 0:
            print(f"  Processing {idx + 1}/{len(top_routes)}: {origin}-{dest}")
        
        route_data = per_route_groups.get_group((origin, dest)).sort_values('ds')
        
        route_prophet_data = route_data[['ds', 'avg_air_time']].copy()
        route_prophet_data = route_prophet_data.rename(columns={'avg_air_time': 'y'})
        route_prophet_data = route_prophet_data.dropna()
        
        # Re-check after dropping missing values
        if len(route_prophet_data) < min_days_required:
            continue
        
        try:
            # Determine seasonality based on data availability
            has_enough_for_yearly = len(route_prophet_data) >= 365
            has_enough_for_weekly = len(route_prophet_data) >= 14
            
            # Fit Prophet model
            prophet_model = Prophet(
                yearly_seasonality=has_enough_for_yearly,
                weekly_seasonality=has_enough_for_weekly,
                daily_seasonality=False,
                seasonality_mode='multiplicative',
                interval_width=0.95,
                changepoint_prior_scale=0.05
            )
            prophet_model.fit(route_prophet_data)
            
            # Generate the in-sample forecast with a fixed seed
            np.random.seed(prophet_seed)
            forecast = prophet_model.predict(route_prophet_data[['ds']])
            
            # A seasonal component is only present when that seasonality was enabled
            seasonal_cols = [name for name in ('yearly', 'weekly') if name in forecast.columns]
            feature_cols = ['ds', 'trend', *seasonal_cols, 'yhat', 'yhat_lower', 'yhat_upper']
            
            features = forecast[feature_cols].copy()
            features['origin'] = origin
            features['dest'] = dest
            
            # rename() ignores components that are absent from this forecast
            features = features.rename(columns=rename_dict)
            features['date_str'] = features['ds'].dt.strftime('%Y-%m-%d')
            
            prophet_features_per_route.append(features)
            
        except Exception as e:
            # Best effort: one failing route must not abort the remaining fits
            print(f"  Error fitting Prophet for {origin}-{dest}: {str(e)}")
            continue
    
    # Combine all route features
    if prophet_features_per_route:
        prophet_features_route_df = pd.concat(prophet_features_per_route, ignore_index=True)
        print(f"\n✓ Generated Prophet features for {prophet_features_route_df.groupby(['origin', 'dest']).ngroups} routes")
        print(f"Total feature rows: {len(prophet_features_route_df)}")
        print("\nSample features:")
        print(prophet_features_route_df.head(10))
        
        # Save to parquet
        prophet_features_route_spark = spark.createDataFrame(prophet_features_route_df)
        route_prophet_features_path = f"{FOLDER_PATH}/prophet_features_route.parquet"
        prophet_features_route_spark.write.mode("overwrite").parquet(route_prophet_features_path)
        print(f"\nSaved to: {route_prophet_features_path}")
    else:
        print("\nNo Prophet features generated for routes")
else:
    print("Route data not available. Run previous cells first.")


### Per-Route-Time Prophet Models (Air Time)

Generate time-series based expected air times per route-time combination using Prophet.
This provides conditional expected air times that account for route, time-of-day, AND temporal trends.
Important for Flight Lineage features to predict expected flight duration conditional on time of day.


In [None]:
# Generate per-route-time time series for air time and fit Prophet models
# Using time buckets to reduce cardinality while capturing time-of-day effects


def extract_route_time_prophet_features(forecast, origin, dest, hour_bucket):
    """Convert one Prophet forecast frame into labelled route-time feature rows.

    Args:
        forecast: DataFrame returned by ``Prophet.predict``. Must contain
            ``ds``, ``trend``, ``yhat``, ``yhat_lower`` and ``yhat_upper``.
            ``yearly`` and ``weekly`` are used only when present.
        origin: Origin value written to every output row.
        dest: Destination value written to every output row.
        hour_bucket: Time-bucket value written to every output row.

    Returns:
        A new DataFrame holding the selected components renamed to their
        ``*_route_time`` feature names, followed by ``origin``, ``dest``,
        ``hour_bucket`` and ``date_str`` (``ds`` formatted as ``YYYY-MM-DD``).

    Raises:
        KeyError: If one of the required forecast columns is missing.
    """
    # Seasonal components exist only when that seasonality was enabled for the
    # fit, so pick them up by presence instead of assuming 'weekly' is there.
    seasonal_cols = [c for c in ('yearly', 'weekly') if c in forecast.columns]
    selected_cols = ['ds', 'trend'] + seasonal_cols + ['yhat', 'yhat_lower', 'yhat_upper']

    features = forecast[selected_cols].copy()
    features['origin'] = origin
    features['dest'] = dest
    features['hour_bucket'] = hour_bucket

    # rename() ignores keys that are not present, so the optional seasonal
    # components can be listed unconditionally.
    features = features.rename(columns={
        'trend': 'prophet_trend_route_time',
        'yearly': 'prophet_yearly_seasonality_route_time',
        'weekly': 'prophet_weekly_seasonality_route_time',
        'yhat': 'prophet_forecast_air_time_route_time',
        'yhat_lower': 'prophet_forecast_lower_route_time',
        'yhat_upper': 'prophet_forecast_upper_route_time'
    })
    features['date_str'] = features['ds'].dt.strftime('%Y-%m-%d')
    return features


if 'ts_data_cond_time' in locals() and 'air_time' in ts_data_cond_time.columns:
    print("Generating per-route-time time series for air time Prophet models...")
    print("Using time buckets (early_morning, morning, afternoon, evening)...\n")
    
    # Aggregate air time by route, time bucket, and date.
    # No Spark-side orderBy here: every consumer below either groups by key or
    # sorts by 'ds' in pandas, so a global sort before toPandas() is wasted work.
    per_route_time_ts_spark = (
        ts_data_cond_time
        .filter(col('air_time').isNotNull())
        .groupBy('origin', 'dest', 'hour_bucket', 'date')
        .agg(
            F.avg('air_time').alias('avg_air_time'),
            F.count('*').alias('flight_count')
        )
    )
    
    # Convert to pandas for Prophet
    per_route_time_ts = per_route_time_ts_spark.toPandas()
    per_route_time_ts['ds'] = pd.to_datetime(per_route_time_ts['date'])
    
    # Group once and reuse the grouping for the statistics and for the
    # per-combination lookups in the fitting loop (avoids re-scanning the whole
    # frame with a boolean mask for every combination).
    route_time_groups = per_route_time_ts.groupby(['origin', 'dest', 'hour_bucket'])
    
    print(f"Per-route-time time series: {len(per_route_time_ts):,} rows")
    print(f"Number of route-time combinations: {route_time_groups.ngroups}")
    
    # Check data availability per route-time (days with data, most first)
    route_time_day_counts = route_time_groups['ds'].count().sort_values(ascending=False)
    
    print(f"\nData availability statistics:")
    print(f"  Min days: {route_time_day_counts.min()}")
    print(f"  Max days: {route_time_day_counts.max()}")
    print(f"  Mean days: {route_time_day_counts.mean():.1f}")
    print(f"  Median days: {route_time_day_counts.median():.1f}")
    
    # Need at least 14 days for weekly seasonality
    min_days_required = 14
    route_times_with_sufficient_data = (
        route_time_day_counts[route_time_day_counts >= min_days_required]
        .index
        .tolist()
    )
    
    print(f"\nRoute-time combinations with >= {min_days_required} days: {len(route_times_with_sufficient_data)}")
    
    # Limit to top N combinations for initial testing
    max_combinations = 200  # Can adjust based on compute resources
    if len(route_times_with_sufficient_data) > max_combinations:
        print(f"Limiting to top {max_combinations} combinations by data availability...")
        # Counts are sorted descending, so the head is the N best-covered groups.
        top_combinations = route_time_day_counts.head(max_combinations).index.tolist()
    else:
        top_combinations = route_times_with_sufficient_data
    
    print(f"Fitting Prophet models for {len(top_combinations)} route-time combinations...")
    
    prophet_features_per_route_time = []
    
    for idx, (origin, dest, hour_bucket) in enumerate(top_combinations):
        if (idx + 1) % 20 == 0:
            print(f"  Processing {idx + 1}/{len(top_combinations)}: {origin}-{dest}-{hour_bucket}")
        
        route_time_data = (
            route_time_groups
            .get_group((origin, dest, hour_bucket))
            .sort_values('ds')
        )
        
        # Prophet expects exactly the columns 'ds' (date) and 'y' (target)
        route_time_prophet_data = (
            route_time_data[['ds', 'avg_air_time']]
            .rename(columns={'avg_air_time': 'y'})
            .dropna()
        )
        
        if len(route_time_prophet_data) < min_days_required:
            continue
        
        try:
            # Determine seasonality based on data availability
            has_enough_for_yearly = len(route_time_prophet_data) >= 365
            has_enough_for_weekly = len(route_time_prophet_data) >= 14
            
            # Fit Prophet model
            prophet_model = Prophet(
                yearly_seasonality=has_enough_for_yearly,
                weekly_seasonality=has_enough_for_weekly,
                daily_seasonality=False,
                seasonality_mode='multiplicative',
                interval_width=0.95,
                changepoint_prior_scale=0.05
            )
            prophet_model.fit(route_time_prophet_data)
            
            # In-sample forecast: predict on the same dates the model was fit on
            forecast = prophet_model.predict(route_time_prophet_data[['ds']])
            
            features = extract_route_time_prophet_features(forecast, origin, dest, hour_bucket)
            prophet_features_per_route_time.append(features)
            
        except Exception as e:
            # Best effort: one failing combination must not abort the whole run
            print(f"  Error fitting Prophet for {origin}-{dest}-{hour_bucket}: {str(e)}")
            continue
    
    # Combine all route-time features
    if prophet_features_per_route_time:
        prophet_features_route_time_df = pd.concat(prophet_features_per_route_time, ignore_index=True)
        print(f"\n✓ Generated Prophet features for {prophet_features_route_time_df.groupby(['origin', 'dest', 'hour_bucket']).ngroups} route-time combinations")
        print(f"Total feature rows: {len(prophet_features_route_time_df)}")
        print("\nSample features:")
        print(prophet_features_route_time_df.head(10))
        
        # Save to parquet
        prophet_features_route_time_spark = spark.createDataFrame(prophet_features_route_time_df)
        route_time_prophet_features_path = f"{FOLDER_PATH}/prophet_features_route_time.parquet"
        prophet_features_route_time_spark.write.mode("overwrite").parquet(route_time_prophet_features_path)
        print(f"\nSaved to: {route_time_prophet_features_path}")
    else:
        print("\nNo Prophet features generated for route-time combinations")
else:
    print("Route-time data not available. Run previous cells first (including time bucket creation).")


## Summary: Prophet-Based Conditional Expected Values

### Generated Features

The following Prophet-based features have been generated and saved to parquet files:

1. **Per-Carrier Features** (`prophet_features_carrier.parquet`)
   - `prophet_forecast_dep_delay_carrier`: Expected departure delay by carrier (with temporal trends)
   - `prophet_trend_carrier`: Long-term trend component
   - `prophet_weekly_seasonality_carrier`: Weekly seasonality component
   - `prophet_yearly_seasonality_carrier`: Yearly seasonality component (if sufficient data)
   - Join keys: `carrier`, `date_str`

2. **Per-Carrier-Airport Features** (`prophet_features_carrier_airport.parquet`)
   - `prophet_forecast_dep_delay_carrier_airport`: Expected departure delay by carrier-airport
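   - Matching components: `prophet_trend_carrier_airport`, `prophet_weekly_seasonality_carrier_airport`, `prophet_yearly_seasonality_carrier_airport` (if sufficient data), plus the 95% interval bounds `prophet_forecast_lower_carrier_airport` / `prophet_forecast_upper_carrier_airport`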
   - Join keys: `carrier`, `origin`, `date_str`

3. **Per-Carrier-Airport-Time Features** (`prophet_features_carrier_airport_time.parquet`)
   - `prophet_forecast_dep_delay_carrier_airport_time`: Expected departure delay by carrier-airport-time
   - Conditional on time bucket (early_morning, morning, afternoon, evening)
   - Join keys: `carrier`, `origin`, `hour_bucket`, `date_str`

4. **Per-Route Features** (`prophet_features_route.parquet`)
   - `prophet_forecast_air_time_route`: Expected air time by route (origin-dest)
   - Important for Flight Lineage features to predict flight duration
   - Join keys: `origin`, `dest`, `date_str`

5. **Per-Route-Time Features** (`prophet_features_route_time.parquet`)
   - `prophet_forecast_air_time_route_time`: Expected air time by route-time
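   - Conditional on time bucket (`hour_bucket`); currently fitted only for the top 200 route-time combinations by number of days with data, so other combinations will have no matching rows when joined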
   - Join keys: `origin`, `dest`, `hour_bucket`, `date_str`

### Usage in Flight Lineage Features

These Prophet features can be used to replace or supplement the simple average-based conditional expected values:

- **Expected Turn Time**: Use `prophet_forecast_dep_delay_carrier_airport_time` to predict delays that affect turn time
- **Expected Air Time**: Use `prophet_forecast_air_time_route_time` for route-specific air time predictions
- **Conditional Expected Delays**: Use carrier/airport/time-specific forecasts for more accurate delay expectations

### Next Steps

1. Load these features in Flight Lineage Features Experiment notebook
2. Join to flight data using the appropriate join keys
3. Use Prophet forecasts as features in the deterministic prediction formulas
4. Compare performance with simple average-based features


In [None]:
# Generate per-carrier-airport time series and fit Prophet models
# This will take longer due to many combinations, so we'll filter to those with sufficient data


def extract_carrier_airport_prophet_features(forecast, carrier, origin):
    """Convert one Prophet forecast frame into labelled carrier-airport feature rows.

    Args:
        forecast: DataFrame returned by ``Prophet.predict``. Must contain
            ``ds``, ``trend``, ``yhat``, ``yhat_lower`` and ``yhat_upper``.
            ``yearly`` and ``weekly`` are used only when present.
        carrier: Carrier value written to every output row.
        origin: Origin airport value written to every output row.

    Returns:
        A new DataFrame holding the selected components renamed to their
        ``*_carrier_airport`` feature names, followed by ``carrier``,
        ``origin`` and ``date_str`` (``ds`` formatted as ``YYYY-MM-DD``).

    Raises:
        KeyError: If one of the required forecast columns is missing.
    """
    # Seasonal components exist only when that seasonality was enabled for the
    # fit, so pick them up by presence instead of assuming 'weekly' is there.
    seasonal_cols = [c for c in ('yearly', 'weekly') if c in forecast.columns]
    selected_cols = ['ds', 'trend'] + seasonal_cols + ['yhat', 'yhat_lower', 'yhat_upper']

    features = forecast[selected_cols].copy()
    features['carrier'] = carrier
    features['origin'] = origin

    # rename() ignores keys that are not present, so the optional seasonal
    # components can be listed unconditionally.
    features = features.rename(columns={
        'trend': 'prophet_trend_carrier_airport',
        'yearly': 'prophet_yearly_seasonality_carrier_airport',
        'weekly': 'prophet_weekly_seasonality_carrier_airport',
        'yhat': 'prophet_forecast_dep_delay_carrier_airport',
        'yhat_lower': 'prophet_forecast_lower_carrier_airport',
        'yhat_upper': 'prophet_forecast_upper_carrier_airport'
    })
    features['date_str'] = features['ds'].dt.strftime('%Y-%m-%d')
    return features


if 'ts_data_cond' in locals() and 'carrier' in ts_data_cond.columns:
    print("Generating per-carrier-airport time series for Prophet models...")
    print("This may take a while due to the large number of combinations...\n")
    
    # Aggregate departure delays by carrier, airport, and date.
    # No Spark-side orderBy here: every consumer below either groups by key or
    # sorts by 'ds' in pandas, so a global sort before toPandas() is wasted work.
    per_carrier_airport_ts_spark = (
        ts_data_cond
        .filter(col('DEP_DELAY').isNotNull())
        .groupBy('carrier', 'origin', 'date')
        .agg(
            F.avg('DEP_DELAY').alias('avg_dep_delay'),
            F.count('*').alias('flight_count')
        )
    )
    
    # Convert to pandas for Prophet
    per_carrier_airport_ts = per_carrier_airport_ts_spark.toPandas()
    per_carrier_airport_ts['ds'] = pd.to_datetime(per_carrier_airport_ts['date'])
    
    # Group once and reuse the grouping for the statistics and for the
    # per-combination lookups in the fitting loop (avoids re-scanning the whole
    # frame with a boolean mask for every combination).
    carrier_airport_groups = per_carrier_airport_ts.groupby(['carrier', 'origin'])
    
    print(f"Per-carrier-airport time series: {len(per_carrier_airport_ts):,} rows")
    print(f"Number of carrier-airport combinations: {carrier_airport_groups.ngroups}")
    
    # Check data availability per carrier-airport (days with data, most first)
    carrier_airport_day_counts = carrier_airport_groups['ds'].count().sort_values(ascending=False)
    
    print(f"\nData availability statistics:")
    print(f"  Min days: {carrier_airport_day_counts.min()}")
    print(f"  Max days: {carrier_airport_day_counts.max()}")
    print(f"  Mean days: {carrier_airport_day_counts.mean():.1f}")
    print(f"  Median days: {carrier_airport_day_counts.median():.1f}")
    
    # Need at least 14 days for weekly seasonality
    min_days_required = 14
    carrier_airports_with_sufficient_data = (
        carrier_airport_day_counts[carrier_airport_day_counts >= min_days_required]
        .index
        .tolist()
    )
    
    print(f"\nCarrier-airport combinations with >= {min_days_required} days: {len(carrier_airports_with_sufficient_data)}")
    
    # Limit to top N combinations for initial testing (can remove limit for full run)
    # For production, fit models for all combinations with sufficient data
    max_combinations = 100  # Adjust based on compute resources
    if len(carrier_airports_with_sufficient_data) > max_combinations:
        print(f"Limiting to top {max_combinations} combinations by data availability...")
        # Counts are sorted descending, so the head is the N best-covered groups.
        top_combinations = carrier_airport_day_counts.head(max_combinations).index.tolist()
    else:
        top_combinations = carrier_airports_with_sufficient_data
    
    print(f"Fitting Prophet models for {len(top_combinations)} carrier-airport combinations...")
    
    prophet_features_per_carrier_airport = []
    
    for idx, (carrier, origin) in enumerate(top_combinations):
        if (idx + 1) % 10 == 0:
            print(f"  Processing {idx + 1}/{len(top_combinations)}: {carrier}-{origin}")
        
        carrier_airport_data = (
            carrier_airport_groups
            .get_group((carrier, origin))
            .sort_values('ds')
        )
        
        # Prophet expects exactly the columns 'ds' (date) and 'y' (target)
        carrier_airport_prophet_data = (
            carrier_airport_data[['ds', 'avg_dep_delay']]
            .rename(columns={'avg_dep_delay': 'y'})
            .dropna()
        )
        
        if len(carrier_airport_prophet_data) < min_days_required:
            continue
        
        try:
            # Determine seasonality based on data availability
            has_enough_for_yearly = len(carrier_airport_prophet_data) >= 365
            has_enough_for_weekly = len(carrier_airport_prophet_data) >= 14
            
            # Fit Prophet model
            # NOTE(review): multiplicative seasonality scales the trend; if the
            # daily average delay can be zero or negative this may be a poor
            # fit compared to 'additive' - confirm before relying on it.
            prophet_model = Prophet(
                yearly_seasonality=has_enough_for_yearly,
                weekly_seasonality=has_enough_for_weekly,
                daily_seasonality=False,
                seasonality_mode='multiplicative',
                interval_width=0.95,
                changepoint_prior_scale=0.05
            )
            prophet_model.fit(carrier_airport_prophet_data)
            
            # In-sample forecast: predict on the same dates the model was fit on
            forecast = prophet_model.predict(carrier_airport_prophet_data[['ds']])
            
            features = extract_carrier_airport_prophet_features(forecast, carrier, origin)
            prophet_features_per_carrier_airport.append(features)
            
        except Exception as e:
            # Best effort: one failing combination must not abort the whole run
            print(f"  Error fitting Prophet for {carrier}-{origin}: {str(e)}")
            continue
    
    # Combine all carrier-airport features
    if prophet_features_per_carrier_airport:
        prophet_features_carrier_airport_df = pd.concat(prophet_features_per_carrier_airport, ignore_index=True)
        print(f"\n✓ Generated Prophet features for {prophet_features_carrier_airport_df.groupby(['carrier', 'origin']).ngroups} carrier-airport combinations")
        print(f"Total feature rows: {len(prophet_features_carrier_airport_df)}")
        print("\nSample features:")
        print(prophet_features_carrier_airport_df.head(10))
        
        # Save to parquet
        prophet_features_carrier_airport_spark = spark.createDataFrame(prophet_features_carrier_airport_df)
        carrier_airport_prophet_features_path = f"{FOLDER_PATH}/prophet_features_carrier_airport.parquet"
        prophet_features_carrier_airport_spark.write.mode("overwrite").parquet(carrier_airport_prophet_features_path)
        print(f"\nSaved to: {carrier_airport_prophet_features_path}")
    else:
        print("\nNo Prophet features generated for carrier-airport combinations")
else:
    print("Carrier data not available. Run previous cells first.")


In [None]:
# Per-route-time expected values: plain air-time statistics (mean, standard
# deviation, flight count) keyed on route AND departure time bucket.

if 'ts_data_cond_time' not in locals() or 'air_time' not in ts_data_cond_time.columns:
    print("Data not available. Run previous cells first.")
else:
    print("Computing per-route-time expected values...")

    # Grouping keys and the statistics computed for each group
    route_time_keys = ['origin', 'dest', 'hour_bucket']
    air_time_stats = [
        F.avg('air_time').alias('expected_air_time_route_time'),
        F.stddev('air_time').alias('std_air_time_route_time'),
        F.count('*').alias('flight_count_route_time'),
    ]

    # Only flights with a recorded air time contribute to the statistics
    flights_with_air_time = ts_data_cond_time.filter(col('air_time').isNotNull())
    expected_air_time_route_time = (
        flights_with_air_time
        .groupBy(*route_time_keys)
        .agg(*air_time_stats)
    )

    n_route_time_combinations = expected_air_time_route_time.count()
    print(f"Per-route-time air time features computed for {n_route_time_combinations} combinations")
    print("\nSample per-route-time features:")
    display(expected_air_time_route_time.limit(10))

    # Persist next to the other experiment outputs
    route_time_features_path = f"{FOLDER_PATH}/expected_values_route_time.parquet"
    (
        expected_air_time_route_time
        .write
        .mode("overwrite")
        .parquet(route_time_features_path)
    )
    print(f"\nSaved to: {route_time_features_path}")
