In [0]:
# Dependencies
import importlib.util
import sys
import pandas as pd

# Load modules
# The helper modules are plain .py files stored alongside the notebooks in the
# workspace (not an installed package), so they are imported by file path.
NOTEBOOKS_DIR = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks"


def _load_module_from_path(module_name, file_path):
    """Execute the Python source file at ``file_path`` and return it as a module.

    Args:
        module_name: Name passed to the import spec for the loaded module.
        file_path: Path of the ``.py`` file to load.

    Returns:
        The executed module object. It is not registered in ``sys.modules``.

    Raises:
        ImportError: If no import spec or loader can be created for ``file_path``.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # spec_from_file_location returns None when it cannot choose a loader for
    # the path; fail with a readable message instead of an AttributeError.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {file_path!r}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


cv_path = f"{NOTEBOOKS_DIR}/Cross Validator/cv.py"
cv = _load_module_from_path("cv", cv_path)

graph_features_path = f"{NOTEBOOKS_DIR}/Feature Engineering/graph_features.py"
graph_features = _load_module_from_path("graph_features", graph_features_path)

time_series_features_path = f"{NOTEBOOKS_DIR}/Feature Engineering/time_series_features.py"
time_series_features = _load_module_from_path("time_series_features", time_series_features_path)

from pyspark.sql import functions as F
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

## Define Features

In [0]:
# Endpoints of a flight leg; reused to build the per-endpoint column names below.
_ENDPOINTS = ('origin', 'dest')

# Categorical inputs. List order is preserved exactly because it determines
# the order of the assembled feature vector downstream.
categorical_features = (
    ['day_of_week', 'op_carrier']
    # airport code, then state abbreviation, for origin followed by destination
    + [f'{endpoint}{suffix}' for endpoint in _ENDPOINTS for suffix in ('', '_state_abr')]
    # scheduled departure / arrival time blocks (scheduled, so not outcome variables)
    + ['dep_time_blk', 'arr_time_blk']
    # day_of_month, plus month for cyclical patterns
    + ['day_of_month', 'month']
)

# Hourly weather measurements; each column name is 'hourly' + measurement.
_HOURLY_MEASUREMENTS = (
    'precipitation',
    'sealevelpressure',
    'altimetersetting',
    'wetbulbtemperature',
    'stationpressure',
    'winddirection',
    'relativehumidity',
    'windspeed',
    'dewpointtemperature',
    'drybulbtemperature',
    'visibility',
)

numerical_features = [f'hourly{measurement}' for measurement in _HOURLY_MEASUREMENTS] + [
    'crs_elapsed_time',
    'distance',
    'elevation',
]

# Graph features that will be added by GraphFeaturesEstimator
graph_feature_cols = [
    f'{endpoint}_pagerank_{variant}'
    for endpoint in _ENDPOINTS
    for variant in ('weighted', 'unweighted')
]

# Time-series features that will be added by TimeSeriesFeaturesEstimator
# Note: yearly_seasonality features are only added if >=365 days of data available
_PROPHET_SCOPES = ('global', 'carrier', 'airport')
_PROPHET_CORE_COMPONENTS = ('forecast_dep_delay', 'trend', 'weekly_seasonality')

time_series_feature_cols = (
    # Core features per scope: global is guaranteed; carrier / airport are
    # guaranteed if that carrier / airport has >=14 days
    [
        f'prophet_{component}_{scope}'
        for scope in _PROPHET_SCOPES
        for component in _PROPHET_CORE_COMPONENTS
    ]
    # Optional yearly seasonality features (only if >=365 days of data)
    # These may not exist for all carriers/airports, so handle gracefully
    + [f'prophet_yearly_seasonality_{scope}' for scope in _PROPHET_SCOPES]
)

## Construct Model Pipeline

In [0]:
# Derived column names shared by several stages below.
# Continuous inputs = raw numerics + engineered graph + time-series features.
continuous_cols = numerical_features + graph_feature_cols + time_series_feature_cols
imputed_cols = [f"{name}_IMPUTED" for name in continuous_cols]
index_cols = [f"{name}_INDEX" for name in categorical_features]
vec_cols = [f"{name}_VEC" for name in categorical_features]

# Stage 1: builds the graph and adds PageRank features
graph_estimator = graph_features.GraphFeaturesEstimator(
    origin_col="origin", dest_col="dest", reset_probability=0.15, max_iter=10
)

# Stage 2: generates Prophet-based time-series features
time_series_estimator = time_series_features.TimeSeriesFeaturesEstimator(
    date_col="FL_DATE",
    carrier_col="op_carrier",
    origin_col="origin",
    delay_col="DEP_DELAY",
    min_days_required=14,
    changepoint_prior_scale=0.05,
)

# Stage 3: mean-impute every continuous column into a *_IMPUTED twin
imputer = Imputer(inputCols=continuous_cols, outputCols=imputed_cols, strategy="mean")

# Stage 4: string-index categoricals; "keep" assigns unseen labels their own index
indexer = StringIndexer(
    inputCols=categorical_features, outputCols=index_cols, handleInvalid="keep"
)

# Stage 5: one-hot encode the indexed categoricals
encoder = OneHotEncoder(inputCols=index_cols, outputCols=vec_cols)

# Stage 6: assemble one-hot vectors first, then the imputed continuous columns;
# rows that still contain invalid values are skipped
assembler = VectorAssembler(
    inputCols=vec_cols + imputed_cols,
    outputCol="features",
    handleInvalid="skip",
)

# Stage 7: standardize (center and scale) the assembled vector
scaler = StandardScaler(
    inputCol="features", outputCol="scaled_features", withMean=True, withStd=True
)

# Stage 8: ordinary least squares on the standardized features
lr = LinearRegression(
    featuresCol="scaled_features",
    labelCol="DEP_DELAY",
    solver="normal",  # Required for p-values and standard errors
    regParam=0.0,  # Required for statistical significance measures
    elasticNetParam=0.0,
)

# Pipeline with graph features and time-series features
pipeline_stages = [
    graph_estimator,
    time_series_estimator,
    imputer,
    indexer,
    encoder,
    assembler,
    scaler,
    lr,
]
lr_pipe = Pipeline(stages=pipeline_stages)

## Run Cross-Validation with Graph Features

In [0]:
# Cross-validate the full pipeline; no fold arguments are passed, so
# FlightDelayCV uses whatever fold settings it defaults to.
# NOTE(review): version="3M" presumably selects the 3-month dataset; confirm in cv.py.
cv_obj = cv.FlightDelayCV(estimator=lr_pipe, version="3M")
cv_obj.fit()

In [0]:
# Train on overlapping folds
# Same estimator as the previous run; only the fold configuration differs.
overlapping_cv_kwargs = dict(
    version="3M",
    fold_strategy="overlapping",
    n_folds=3,
    train_window_sections=2,
    n_sections=5,
)
cv_obj_overlapping = cv.FlightDelayCV(estimator=lr_pipe, **overlapping_cv_kwargs)
cv_obj_overlapping.fit()

## View Cross-Validation Results

In [0]:
# Show the metrics recorded by the cross-validator, then their column-wise mean.
fold_metrics = cv_obj.metrics
print("Cross-Validation Results:")
print(fold_metrics)
print("\nMean metrics across folds:")
print(pd.DataFrame(fold_metrics).mean())

In [0]:
# Same report as the previous cell, for the overlapping-fold run.
overlapping_metrics = cv_obj_overlapping.metrics
print("Cross-Validation Results:", overlapping_metrics, sep="\n")
print("\nMean metrics across folds:", pd.DataFrame(overlapping_metrics).mean(), sep="\n")

# END OF DEMO
The cells below validate that the model pipeline works with graph features.

## Verify Graph Features Ran Successfully
Access the pipeline and model objects to validate that the graph features loaded correctly and were used in the model (Kudos Claude)

In [0]:
# Verify graph features were used in the model
print("=== Graph Features Verification ===\n")

# NOTE(review): models[0] / folds[0] are assumed to be the first fold's fitted
# PipelineModel and its (train, validation) pair; confirm against cv.py.
fitted_pipeline = cv_obj.models[0]
val_df = cv_obj.folds[0][1]  # First fold's validation data

# Check pipeline stages to confirm graph estimator is present
print("Pipeline stages:")
for i, stage in enumerate(fitted_pipeline.stages):
    stage_name = type(stage).__name__
    print(f"  {i}: {stage_name}")
    if 'Graph' in stage_name:
        print("      ✓ Graph features estimator found!")

# Check if graph features are in transformed data
print("\nChecking transformed validation data for graph features...")
sample_val_df = fitted_pipeline.transform(val_df)
graph_cols = [col for col in sample_val_df.columns if 'pagerank' in col.lower()]
print(f"Graph feature columns found: {graph_cols}")

# Show sample of graph features
print("\nSample graph feature values:")
sample_val_df.select("origin", "dest", *graph_cols).show(5)

# Get Linear Regression model and coefficients
# (the model is fit on "scaled_features", so these are coefficients on the
# standardized features, in the same order as the assembled vector)
print("\n=== Graph Feature Coefficients ===")
lr_model = fitted_pipeline.stages[-1]  # Last stage is LinearRegression
coefficients = lr_model.coefficients.toArray()

# Find the VectorAssembler by type instead of by position: hard-coded stage
# indices silently point at the wrong stage whenever a stage is added to the
# pipeline. A separate name is used so the `assembler` object that defines
# the pipeline is not overwritten.
fitted_assembler = next(
    (stage for stage in fitted_pipeline.stages if isinstance(stage, VectorAssembler)),
    None,
)
if fitted_assembler is None:
    raise RuntimeError("No VectorAssembler stage found in the fitted pipeline")
input_cols = fitted_assembler.getInputCols()

# Work out where each assembler input starts inside the feature vector by
# measuring the width of every input column on one fully transformed row.
first_row = sample_val_df.select(*input_cols).first()
if first_row is None:
    raise RuntimeError("Transformed validation data is empty; cannot determine feature layout")

feature_offsets = {}
categorical_feature_count = 0
total_feature_count = 0
for col in input_cols:
    feature_offsets[col] = total_feature_count
    value = first_row[col]
    # One-hot columns are vectors (width = vector length); imputed numeric
    # columns are scalars (width = 1).
    width = len(value.toArray()) if hasattr(value, "toArray") else 1
    if col.endswith("_VEC"):
        categorical_feature_count += width
    total_feature_count += width

if total_feature_count != len(coefficients):
    print(
        f"WARNING: assembled feature width ({total_feature_count}) does not match "
        f"the number of coefficients ({len(coefficients)}); indices below may be off."
    )

# Numerical features (including graph) start after categorical, because the
# assembler lists every *_VEC column before the *_IMPUTED columns.
numerical_start_idx = categorical_feature_count

print(f"Total categorical features: {categorical_feature_count}")
print(f"Numerical features start at index: {numerical_start_idx}")
print("\nGraph feature coefficients:")
print(f"{'Feature':<40} {'Coefficient':<15}")
print("-" * 55)

graph_coefs = {}
for feat_name in graph_feature_cols:
    coef_idx = feature_offsets.get(f"{feat_name}_IMPUTED")
    if coef_idx is None:
        print(f"{feat_name:<40} {'Not found':<15}")
    elif coef_idx >= len(coefficients):
        print(f"{feat_name:<40} {'Index out of range':<15}")
    else:
        coef = coefficients[coef_idx]
        graph_coefs[feat_name] = coef
        print(f"{feat_name:<40} {coef:>14.6f}")

## Graph Feature Statistical Significance (Simplified Model)
Simple model with only graph features to determine the statistical significance of these features (Kudos Claude)

In [0]:
# Workaround: Fit a simplified model with just graph features to get statistics
print("=== Workaround: Simplified Model for Graph Feature Significance ===\n")

# Get transformed training data with graph features.
# Run the first fold's training data through the fitted stages up to and
# including the imputer. The imputer is located by class name rather than by a
# hard-coded index: the pipeline has a time-series stage between the graph
# stage and the imputer, and the imputer needs that stage's output columns.
train_df_with_graph = cv_obj.folds[0][0]
for stage in cv_obj.models[0].stages:
    train_df_with_graph = stage.transform(train_df_with_graph)
    if type(stage).__name__ == "ImputerModel":
        break
else:
    raise RuntimeError("No fitted Imputer stage found in the pipeline model")

# Select only graph features + label for simplified model
graph_features_imputed = [f"{col}_IMPUTED" for col in graph_feature_cols]
simplified_features = graph_features_imputed + ["DEP_DELAY"]

# Create simplified dataset (rows with a null feature or label are dropped)
simplified_df = train_df_with_graph.select(*simplified_features).dropna()

print(f"Simplified model features: {graph_feature_cols}")
print(f"Simplified dataset size: {simplified_df.count():,} rows\n")

# Fit simplified LinearRegression model using the classes already imported in
# the notebook's first cell.
# Assemble features (unscaled, so coefficients are in raw PageRank units)
va_simple = VectorAssembler(inputCols=graph_features_imputed, outputCol="features", handleInvalid="skip")
lr_simple = LinearRegression(
    featuresCol="features",
    labelCol="DEP_DELAY",
    solver="normal",  # normal-equation solver with no regularization, as in the main model
    regParam=0.0,
    elasticNetParam=0.0
)

# Fit model
assembled_df = va_simple.transform(simplified_df)
simple_model = lr_simple.fit(assembled_df)

In [0]:
def _significance_stars(pval, t_stat):
    """Return the significance marker ("***", "**", "*" or "") for one coefficient.

    Uses the p-value when available; otherwise falls back to the t-statistic
    with two-sided normal critical values for the same cut-offs
    (p<0.001, p<0.01, p<0.05), so both paths agree with the printed legend.
    """
    if pval is not None:
        if pval < 0.001:
            return "***"
        if pval < 0.01:
            return "**"
        if pval < 0.05:
            return "*"
        return ""
    if t_stat is not None:
        abs_t = abs(t_stat)
        if abs_t > 3.291:  # p < 0.001
            return "***"
        if abs_t > 2.576:  # p < 0.01
            return "**"
        if abs_t > 1.96:  # p < 0.05
            return "*"
    return ""


def _format_stat(value, spec):
    """Format ``value`` with format spec ``spec``, or "N/A" when it is missing.

    Tests ``is None`` rather than truthiness so a legitimate 0.0 is still shown.
    """
    return "N/A" if value is None else format(value, spec)


# Get model summary statistics
simple_summary = simple_model.summary

print("=== Simplified Model Summary Statistics ===")
print(f"RMSE: {simple_summary.rootMeanSquaredError:.6f}")
print(f"R²: {simple_summary.r2:.6f}")
print(f"Mean Absolute Error: {simple_summary.meanAbsoluteError:.6f}")
print(f"Total iterations: {simple_summary.totalIterations}")
# [-5:] already returns the whole history when it has fewer than 5 entries
print(f"Objective history: {simple_summary.objectiveHistory[-5:]}\n")

# Get statistics from simplified model
simple_p_values = simple_summary.pValues
simple_std_errors = simple_summary.coefficientStandardErrors
simple_t_values = simple_summary.tValues

print("✓ Statistical measures available for simplified model!\n")
print(f"{'Feature':<40} {'Coefficient':<15} {'Std Error':<15} {'T-stat':<12} {'P-value':<15} {'Significant':<10}")
print("-" * 107)

for i, feat_name in enumerate(graph_feature_cols):
    coef = simple_model.coefficients[i]
    # Guard the index in case a statistics array is shorter than the feature list
    std_err = simple_std_errors[i] if i < len(simple_std_errors) else None
    t_stat = simple_t_values[i] if i < len(simple_t_values) else None
    pval = simple_p_values[i] if i < len(simple_p_values) else None

    # Determine significance: use p-value if available, otherwise use t-statistic
    sig = _significance_stars(pval, t_stat)

    std_err_str = _format_stat(std_err, ">14.6f")
    t_stat_str = _format_stat(t_stat, ">12.4f")

    # Format p-value: extremely small values are reported as an upper bound
    # instead of a rounded 0; a missing p-value is shown as N/A so it cannot
    # be mistaken for a highly significant result.
    if pval is None:
        pval_str = "N/A"
    elif pval < 2.2e-15:
        pval_str = "< 2.2e-15"
    else:
        pval_str = f"{pval:>14.6f}"

    print(f"{feat_name:<40} {coef:>14.6f} {std_err_str:>15} {t_stat_str:>12} {pval_str:>15} {sig:>10}")

print("\nSignificance levels: *** p<0.001, ** p<0.01, * p<0.05")
print("\nNote: These statistics are from a simplified model with only graph features.")
print("      They show the significance of graph features in isolation.")