In [0]:
# Dependencies
import sys
import pandas as pd

# Load modules from our Databricks repo
import importlib.util


def _load_module_from_path(module_name, module_path):
    """Load and execute a Python source file as a module object.

    Args:
        module_name: Name given to the module spec (becomes the module's ``__name__``).
        module_path: Path of the ``.py`` file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be created for ``module_path``.
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path!r}")
    module = importlib.util.module_from_spec(spec)
    # As in the inline version this replaces, the module is executed but is
    # not registered in sys.modules.
    spec.loader.exec_module(module)
    return module


cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
cv = _load_module_from_path("cv", cv_path)

graph_features_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Feature Engineering/graph_features.py"
graph_features = _load_module_from_path("graph_features", graph_features_path)

time_series_features_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Feature Engineering/time_series_features.py"
time_series_features = _load_module_from_path("time_series_features", time_series_features_path)

flight_lineage_features_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Feature Engineering/flight_lineage_features.py"
flight_lineage_features = _load_module_from_path("flight_lineage_features", flight_lineage_features_path)


from pyspark.sql import functions as F
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, SQLTransformer
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

## Define Features

In [0]:
# Categorical inputs (string-indexed and one-hot encoded downstream).
# dep_time_blk / arr_time_blk are *scheduled* blocks, so they are not outcome variables.
categorical_features = [
    'day_of_week',
    'op_carrier',
    'origin', 'origin_state_abr',   # origin airport code / state abbreviation
    'dest', 'dest_state_abr',       # destination airport code / state abbreviation
    'dep_time_blk', 'arr_time_blk',
    'day_of_month',
    'month',                        # cyclical patterns
]

# Raw numeric inputs: hourly weather measurements followed by flight/airport attributes.
_hourly_weather_measures = (
    'precipitation',
    'sealevelpressure',
    'altimetersetting',
    'wetbulbtemperature',
    'stationpressure',
    'winddirection',
    'relativehumidity',
    'windspeed',
    'dewpointtemperature',
    'drybulbtemperature',
    'visibility',
)
numerical_features = [f"hourly{measure}" for measure in _hourly_weather_measures]
numerical_features += ['crs_elapsed_time', 'distance', 'elevation']
# Flight-lineage derived features are not included yet.

# Graph features that will be added by GraphFeaturesEstimator
graph_feature_cols = [
    f"{endpoint}_pagerank_{weighting}"
    for endpoint in ('origin', 'dest')
    for weighting in ('weighted', 'unweighted')
]

# Time-series features that will be added by TimeSeriesFeaturesEstimator.
# Per scope (global / carrier / airport): forecast, trend and weekly seasonality.
# Global features are guaranteed; carrier and airport features are guaranteed only
# when that carrier/airport has >=14 days of data.
_prophet_scopes = ('global', 'carrier', 'airport')
_prophet_core_components = ('forecast_dep_delay', 'trend', 'weekly_seasonality')
time_series_feature_cols = [
    f"prophet_{component}_{scope}"
    for scope in _prophet_scopes
    for component in _prophet_core_components
]
# Optional yearly seasonality features (only added if >=365 days of data are available).
# These may not exist for all carriers/airports, so handle gracefully.
time_series_feature_cols += [
    f"prophet_yearly_seasonality_{scope}" for scope in _prophet_scopes
]

## Construct Model Pipeline

In [0]:
# TODO: Add train_fold_with_flight_lineage = flight_lineage_features.add_flight_lineage_features(train_fold) to the pipeline

# Every numeric column that is imputed and then assembled: the raw numeric inputs
# plus the columns produced by the graph and time-series estimators.
numeric_input_cols = numerical_features + graph_feature_cols + time_series_feature_cols
imputed_numeric_cols = [f"{col}_IMPUTED" for col in numeric_input_cols]

# Graph Features Estimator (builds graph and adds PageRank features)
graph_estimator = graph_features.GraphFeaturesEstimator(
    origin_col="origin",
    dest_col="dest",
    reset_probability=0.15,
    max_iter=10
)

# Time-Series Features Estimator (adds the prophet_* columns listed in time_series_feature_cols).
# The pipeline below references this stage, but it was never constructed, so building
# the pipeline on a fresh kernel raised NameError.
# NOTE(review): instantiated with the estimator's default arguments — confirm the
# constructor signature in time_series_features.py and pass explicit arguments if required.
time_series_estimator = time_series_features.TimeSeriesFeaturesEstimator()

# Fill missing numeric values with the column mean learned on the training fold.
imputer = Imputer(
    inputCols=numeric_input_cols,
    outputCols=imputed_numeric_cols,
    strategy="mean"
)

# handleInvalid="keep" gives categories unseen during fit their own index instead of failing.
indexer = StringIndexer(
    inputCols=categorical_features,
    outputCols=[f"{col}_INDEX" for col in categorical_features],
    handleInvalid="keep"
)

encoder = OneHotEncoder(
    inputCols=[f"{col}_INDEX" for col in categorical_features],
    outputCols=[f"{col}_VEC" for col in categorical_features]
)

# Feature vector layout: all one-hot vectors first, then the imputed numeric columns.
# handleInvalid="skip" drops rows that still contain invalid values.
assembler = VectorAssembler(
    inputCols=[f"{col}_VEC" for col in categorical_features] + imputed_numeric_cols,
    outputCol="features",
    handleInvalid="skip"
)

scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True
)

# Log transform target BEFORE model (handles negatives and zeros)
# For negative delays, we use signed log: SIGN(y) * LOG(ABS(y) + 1)
# This preserves the sign while applying log transform
# Note: Filters out NULL DEP_DELAY values (required for LinearRegression).
# The filter is part of the SQL statement, so it also applies whenever the fitted
# pipeline transforms data.
log_transform = SQLTransformer(
    statement="""
    SELECT *, 
           SIGN(DEP_DELAY) * LOG(ABS(DEP_DELAY) + 1.0) AS DEP_DELAY_log
    FROM __THIS__
    WHERE DEP_DELAY IS NOT NULL
    """
)

lr = LinearRegression(
    featuresCol="scaled_features", 
    labelCol="DEP_DELAY_log",  # Use log-transformed target
    predictionCol="prediction_log",  # Predictions in log space
    solver="normal",  # Required for p-values and standard errors
    regParam=0.0,  # Required for statistical significance measures
    elasticNetParam=0.0,
)

# Inverse transform AFTER model (inverse of signed log transform)
# Inverse of SIGN(y) * LOG(ABS(y) + 1) is SIGN(y) * (EXP(ABS(y)) - 1)
exp_transform = SQLTransformer(
    statement="""
    SELECT *, 
           CASE 
               WHEN prediction_log IS NOT NULL THEN 
                   SIGN(prediction_log) * (EXP(ABS(prediction_log)) - 1.0)
               ELSE NULL 
           END AS prediction
    FROM __THIS__
    """
)

# Pipeline with graph features, time-series features, and log/exp transforms
lr_pipe = Pipeline(stages=[
    graph_estimator,
    time_series_estimator,
    imputer,
    indexer,
    encoder,
    assembler,
    scaler,
    log_transform,
    lr,
    exp_transform,
])

## Run Cross-Validation with Graph and Time-Series Features

In [0]:
# Cross-validate the linear-regression pipeline on the "3M" dataset version.
cv_obj = cv.FlightDelayCV(estimator=lr_pipe, version="3M")
cv_obj.fit()

In [0]:
# Per-fold metrics first, then their mean across folds.
print("Cross-Validation Results:")
print(cv_obj.metrics)

mean_fold_metrics = pd.DataFrame(cv_obj.metrics).mean()
print("\nMean metrics across folds:")
print(mean_fold_metrics)

In [0]:
# Print whatever the cross-validator's evaluate() returns.
evaluation_result = cv_obj.evaluate()
print(evaluation_result)


## Augment Flight Dataframe with Flight Lineage Features

## Validate Flight Lineage Data Features are in Folds

In [0]:
# Load the "3M" folds and unpack the first one.
data_loader = cv.FlightDelayDataLoader()
data_loader.load()
folds = data_loader.get_version("3M")

# NOTE(review): `text_fold` looks like a typo for a test/validation fold name;
# kept unchanged in case later cells reference it.
first_fold = folds[0]
train_fold, text_fold = first_fold

# Show the first 10 training rows.
display(train_fold.limit(10))

## Verify Graph & Time-Series Features Ran Successfully
Access the fitted pipeline and model objects to validate that the graph and time-series features were loaded correctly and used in the model (Kudos Claude)

In [0]:
# Verify graph and time-series features were used in the model
print("=== Graph and Time-Series Features Verification ===\n")

first_fold_model = cv_obj.models[0]
first_fold_stages = first_fold_model.stages


def _find_stage_index(stages, type_name):
    """Return the index of the first stage whose class name equals ``type_name``.

    Stages are located by class name rather than by hard-coded position so the
    lookup stays correct when stages are added to or removed from the pipeline.

    Raises:
        ValueError: If no stage of that type is present.
    """
    for idx, candidate in enumerate(stages):
        if type(candidate).__name__ == type_name:
            return idx
    raise ValueError(f"No {type_name} stage found in the fitted pipeline")


def _print_feature_coefficients(title, feature_names, name_width, offsets, coefs):
    """Print a coefficient table and return ``{feature_name: coefficient}``.

    Args:
        title: Section title printed above the table.
        feature_names: Un-suffixed feature names; ``_IMPUTED`` is appended for lookup.
        name_width: Width of the feature-name column.
        offsets: Mapping from assembler input column to its start index in the feature vector.
        coefs: Coefficient array of the fitted linear regression.
    """
    print(f"\n=== {title} ===")
    print(f"{'Feature':<{name_width}} {'Coefficient':<15}")
    print("-" * (name_width + 15))

    found = {}
    for feat_name in feature_names:
        coef_idx = offsets.get(f"{feat_name}_IMPUTED")
        if coef_idx is None:
            print(f"{feat_name:<{name_width}} {'Not found':<15}")
        elif coef_idx >= len(coefs):
            print(f"{feat_name:<{name_width}} {'Index out of range':<15}")
        else:
            found[feat_name] = coefs[coef_idx]
            print(f"{feat_name:<{name_width}} {coefs[coef_idx]:>14.6f}")
    return found


# Check pipeline stages to confirm estimators are present
print("Pipeline stages:")
graph_stage_idx = None
time_series_stage_idx = None
for i, stage in enumerate(first_fold_stages):
    stage_name = type(stage).__name__
    print(f"  {i}: {stage_name}")
    if 'Graph' in stage_name:
        print(f"      ✓ Graph features estimator found!")
        graph_stage_idx = i
    if 'TimeSeries' in stage_name:
        print(f"      ✓ Time-series features estimator found!")
        time_series_stage_idx = i

# Check if features are in transformed data
print("\nChecking transformed validation data for features...")
sample_val_df = first_fold_model.transform(cv_obj.folds[0][1])  # First fold's validation data

# Graph features
graph_cols = [col for col in sample_val_df.columns if 'pagerank' in col.lower()]
print(f"\nGraph feature columns found: {graph_cols}")

# Time-series features
time_series_cols = [col for col in sample_val_df.columns if 'prophet' in col.lower()]
print(f"Time-series feature columns found: {len(time_series_cols)} columns")
print(f"  Sample: {time_series_cols[:5]}...")

# Show sample of graph features
print("\n=== Sample Graph Feature Values ===")
sample_val_df.select("origin", "dest", *graph_cols).show(5)

# Show sample of time-series features
print("\n=== Sample Time-Series Feature Values ===")
sample_val_df.select("FL_DATE", "op_carrier", "origin", *time_series_cols[:5]).show(5)

# Get Linear Regression model and coefficients.
# The regression is NOT the last stage: the inverse-transform SQLTransformer follows it.
print("\n=== Feature Coefficients ===")
lr_model = first_fold_stages[_find_stage_index(first_fold_stages, "LinearRegressionModel")]
coefficients = lr_model.coefficients.toArray()

# Get the assembler to find the feature order. A separate name is used so the
# pipeline-level `assembler` variable is not rebound by this cell.
assembler_stage_idx = _find_stage_index(first_fold_stages, "VectorAssembler")
fitted_assembler = first_fold_stages[assembler_stage_idx]
input_cols = fitted_assembler.getInputCols()

# Run one row through every stage that precedes the assembler so the one-hot
# vector columns exist and their widths can be measured.
sample_row = cv_obj.folds[0][1].limit(1)
transformed_before_assembler = sample_row
for stage in first_fold_stages[:assembler_stage_idx]:
    transformed_before_assembler = stage.transform(transformed_before_assembler)

categorical_vec_cols = [col for col in input_cols if col.endswith("_VEC")]
missing_vec_cols = [col for col in categorical_vec_cols if col not in transformed_before_assembler.columns]
if missing_vec_cols:
    # Skipping a missing column would shift every later coefficient index silently.
    raise ValueError(f"One-hot columns missing before the assembler: {missing_vec_cols}")

# Fetch all one-hot vectors in a single Spark job instead of one job per column.
encoded_row = transformed_before_assembler.select(*categorical_vec_cols).first()
if encoded_row is None:
    raise ValueError("Validation fold produced no rows; cannot measure one-hot vector widths")

# Walk the assembler inputs in order to find where each column starts in the
# feature vector. Non-vector inputs are the imputed numeric columns (width 1).
feature_offsets = {}
next_offset = 0
categorical_feature_count = 0
for col in input_cols:
    feature_offsets[col] = next_offset
    if col.endswith("_VEC"):
        width = len(encoded_row[col].toArray())
        categorical_feature_count += width
    else:
        width = 1
    next_offset += width

if next_offset != len(coefficients):
    print(f"WARNING: computed feature vector width {next_offset} != number of coefficients {len(coefficients)}")

# Numerical features (including graph and time-series) start after categorical
numerical_start_idx = categorical_feature_count
all_numerical_imputed = [f"{col}_IMPUTED" for col in numerical_features + graph_feature_cols + time_series_feature_cols]

print(f"Total categorical features: {categorical_feature_count}")
print(f"Numerical features start at index: {numerical_start_idx}")

# Graph feature coefficients
graph_coefs = _print_feature_coefficients(
    "Graph Feature Coefficients", graph_feature_cols, 40, feature_offsets, coefficients
)

# Time-series feature coefficients
time_series_coefs = _print_feature_coefficients(
    "Time-Series Feature Coefficients", time_series_feature_cols, 50, feature_offsets, coefficients
)

## Graph Feature & Time-Series Statistical Significance (Simplified Model)
Simplified models, one with only the graph features and one with only the time-series features, to determine the statistical significance of each feature group (Kudos Claude)

In [0]:
# Workaround: the statistics come from two small regressions, one on the graph
# features alone and one on the time-series features alone.
from pyspark.ml.feature import VectorAssembler as VA
from pyspark.ml.regression import LinearRegression as LR

print("=== Workaround: Simplified Models for Feature Significance ===\n")

# Push the first training fold through the first three fitted stages
# (GraphFeaturesModel -> TimeSeriesFeaturesModel -> Imputer) to obtain the *_IMPUTED columns.
train_df_with_features = cv_obj.folds[0][0]
for feature_stage in cv_obj.models[0].stages[:3]:
    train_df_with_features = feature_stage.transform(train_df_with_features)

# Both regressions share the same settings: un-regularised, normal-equation solver,
# raw DEP_DELAY as the label.
ols_settings = {
    "featuresCol": "features",
    "labelCol": "DEP_DELAY",
    "solver": "normal",
    "regParam": 0.0,
    "elasticNetParam": 0.0,
}

# ----- Graph features only -----
print("=== Graph Features Simplified Model ===\n")

graph_features_imputed = [name + "_IMPUTED" for name in graph_feature_cols]
graph_simplified_features = [*graph_features_imputed, "DEP_DELAY"]
graph_simplified_df = train_df_with_features.select(*graph_simplified_features).dropna()

print(f"Graph features: {graph_feature_cols}")
graph_row_count = graph_simplified_df.count()
print(f"Graph simplified dataset size: {graph_row_count:,} rows\n")

va_graph = VA(inputCols=graph_features_imputed, outputCol="features", handleInvalid="skip")
lr_graph = LR(**ols_settings)
graph_assembled_df = va_graph.transform(graph_simplified_df)
graph_simple_model = lr_graph.fit(graph_assembled_df)

# ----- Time-series features only -----
print("\n=== Time-Series Features Simplified Model ===\n")

time_series_features_imputed = [name + "_IMPUTED" for name in time_series_feature_cols]
time_series_simplified_features = [*time_series_features_imputed, "DEP_DELAY"]
time_series_simplified_df = train_df_with_features.select(*time_series_simplified_features).dropna()

print(f"Time-series features: {len(time_series_feature_cols)} features")
print(f"  Sample: {time_series_feature_cols[:3]}...")
time_series_row_count = time_series_simplified_df.count()
print(f"Time-series simplified dataset size: {time_series_row_count:,} rows\n")

va_time_series = VA(inputCols=time_series_features_imputed, outputCol="features", handleInvalid="skip")
lr_time_series = LR(**ols_settings)
time_series_assembled_df = va_time_series.transform(time_series_simplified_df)
time_series_simple_model = lr_time_series.fit(time_series_assembled_df)

In [0]:
def _is_missing(value):
    """Return True when a statistic is absent (None) or undefined (NaN)."""
    return value is None or value != value


def _significance_stars(pval, t_stat):
    """Map a p-value (or, failing that, a t-statistic) to significance stars.

    Follows the legend printed under each table: *** p<0.001, ** p<0.01, * p<0.05.
    The t-statistic fallback uses the matching two-sided normal critical values
    (3.291, 2.576, 1.96) so both paths agree with that legend.
    """
    if not _is_missing(pval):
        if pval < 0.001:
            return "***"
        if pval < 0.01:
            return "**"
        if pval < 0.05:
            return "*"
        return ""
    if not _is_missing(t_stat):
        abs_t = abs(t_stat)
        if abs_t > 3.291:
            return "***"
        if abs_t > 2.576:
            return "**"
        if abs_t > 1.96:
            return "*"
    return ""


def _print_model_summary(summary):
    """Print the fit-quality fields of a linear-regression training summary."""
    print(f"RMSE: {summary.rootMeanSquaredError:.6f}")
    print(f"R²: {summary.r2:.6f}")
    print(f"Mean Absolute Error: {summary.meanAbsoluteError:.6f}")
    print(f"Total iterations: {summary.totalIterations}")
    # Slicing already returns the whole history when it has fewer than 5 entries.
    print(f"Objective history: {summary.objectiveHistory[-5:]}\n")


def _print_significance_table(feature_names, coefs, std_errors, t_values, p_values, name_width):
    """Print one row per feature: coefficient, std error, t-stat, p-value and stars.

    Args:
        feature_names: Feature names in the order they were assembled.
        coefs: Model coefficients, indexable by feature position.
        std_errors, t_values, p_values: Statistics from the model summary. Per the
            Spark docs, when an intercept is fitted its statistic is the last
            element, so index ``i`` still lines up with feature ``i``.
        name_width: Width of the feature-name column.
    """
    print(f"{'Feature':<{name_width}} {'Coefficient':<15} {'Std Error':<15} {'T-stat':<12} {'P-value':<15} {'Significant':<10}")
    print("-" * (name_width + 67))

    for i, feat_name in enumerate(feature_names):
        coef = coefs[i]
        std_err = std_errors[i] if i < len(std_errors) else None
        t_stat = t_values[i] if i < len(t_values) else None
        pval = p_values[i] if i < len(p_values) else None

        sig = _significance_stars(pval, t_stat)

        # Explicit missing-value checks: a legitimate 0.0 must not be shown as "N/A".
        std_err_str = "N/A" if _is_missing(std_err) else f"{std_err:>14.6f}"
        t_stat_str = "N/A" if _is_missing(t_stat) else f"{t_stat:>12.4f}"

        if _is_missing(pval):
            # A missing p-value is reported as N/A; printing 0 would read as
            # "maximally significant".
            pval_str = "N/A"
        elif pval < 2.2e-15:  # Effectively zero at double precision
            pval_str = "< 2.2e-15"
        else:
            pval_str = f"{pval:>14.6f}"

        print(f"{feat_name:<{name_width}} {coef:>14.6f} {std_err_str:>15} {t_stat_str:>12} {pval_str:>15} {sig:>10}")


# ===== Graph Features Statistics =====
graph_summary = graph_simple_model.summary

print("=== Graph Features Simplified Model Summary Statistics ===")
_print_model_summary(graph_summary)

# Get statistics from graph simplified model
graph_p_values = graph_summary.pValues
graph_std_errors = graph_summary.coefficientStandardErrors
graph_t_values = graph_summary.tValues

print("✓ Statistical measures available for graph features simplified model!\n")
_print_significance_table(
    graph_feature_cols,
    graph_simple_model.coefficients,
    graph_std_errors,
    graph_t_values,
    graph_p_values,
    40,
)

print(f"\nSignificance levels: *** p<0.001, ** p<0.01, * p<0.05")
print(f"\nNote: These statistics are from a simplified model with only graph features.")
print(f"      They show the significance of graph features in isolation.")

# ===== Time-Series Features Statistics =====
time_series_summary = time_series_simple_model.summary

print("\n" + "="*107)
print("=== Time-Series Features Simplified Model Summary Statistics ===")
_print_model_summary(time_series_summary)

# Get statistics from time-series simplified model
time_series_p_values = time_series_summary.pValues
time_series_std_errors = time_series_summary.coefficientStandardErrors
time_series_t_values = time_series_summary.tValues

print("✓ Statistical measures available for time-series features simplified model!\n")
_print_significance_table(
    time_series_feature_cols,
    time_series_simple_model.coefficients,
    time_series_std_errors,
    time_series_t_values,
    time_series_p_values,
    50,
)

print(f"\nSignificance levels: *** p<0.001, ** p<0.01, * p<0.05")
print(f"\nNote: These statistics are from a simplified model with only time-series features.")
print(f"      They show the significance of time-series features in isolation.")

### Test flight_lineage_features module