# XGBoost Preloaded Graph, Meta - Custom Join Data

In [0]:
# Dependencies
import sys
import pandas as pd

# Spark Session (Databricks provides this automatically, but we'll create it explicitly for compatibility)
# getOrCreate() hands back the already-running session when there is one, so re-running this cell is safe.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()


# Load modules from our Databricks repo
# cv.py is loaded from an explicit file path and bound to the name `cv`;
# later cells use cv.FlightDelayDataLoader and cv.FlightDelayCV from it.
import importlib.util
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
spec = importlib.util.spec_from_file_location("cv", cv_path)  # module spec named "cv" for that file
cv = importlib.util.module_from_spec(spec)  # empty module object built from the spec
spec.loader.exec_module(cv)  # run cv.py so its definitions land on the module object


from pyspark.sql import functions as F
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, SQLTransformer
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml import Pipeline
from xgboost.spark import SparkXGBRegressor

In [0]:
# Loader for the "OTPW" source with the "_with_graph_and_metamodels" suffix;
# it is consumed by the OTPW experiment near the end of the notebook.
# NOTE(review): the other FlightDelayCV cells pass `dataloader = data_loader`, but no
# cell visible in this notebook defines `data_loader`, so they raise NameError on a
# fresh kernel. The custom-join loader should be created (and load()-ed) in this cell —
# confirm the correct FlightDelayDataLoader arguments against cv.py.
data_loader_otpw = cv.FlightDelayDataLoader(source="OTPW", suffix="_with_graph_and_metamodels")
data_loader_otpw.load()  # Must call load() manually when providing dataloader

In [0]:
# Categorical inputs: string-indexed and one-hot encoded by the pipeline stages below.
categorical_features = [
    "op_carrier",
    # Route: airport codes and state abbreviations
    "origin",
    "origin_state_abr",
    "dest",
    "dest_state_abr",
    # Scheduled (not actual) departure/arrival blocks, so they are not outcome variables
    "dep_time_blk",
    "arr_time_blk",
    # Kept categorical to capture cyclical patterns
    "month",
]

# Numerical inputs, grouped by origin. The order of the groups (and of the names inside
# each group) is the order in which the columns are assembled into the feature vector.
# The graph (PageRank) columns are listed explicitly at the end.
numerical_features = (
    # --- Flight lineage features (pre-applied in split.py) -------------------------
    [
        "lineage_rank",
        "scheduled_lineage_rotation_time_minutes",
        "scheduled_lineage_turnover_time_minutes",
        "prev_flight_scheduled_flight_time_minutes",
    ]
    # --- Leakage-free prior-flight duration and delay features ---------------------
    + [
        "safe_lineage_rotation_time_minutes",
        "safe_required_time_prev_flight_minutes",
        "safe_prev_departure_delay",
        "safe_prev_arrival_delay",
        "safe_time_since_prev_arrival",
        "prev_flight_distance",  # from the top-performing XGBoost approach
    ]
    # --- Cumulative / aggregated lineage features ----------------------------------
    # These only look at flights BEFORE the immediate previous flight
    # (flights 1, 2, 3, ... excluding n-1), which makes them much safer.
    + [
        "lineage_cumulative_delay",  # sum of delays
        "lineage_avg_delay_previous_flights",  # average delay
        "lineage_max_delay_previous_flights",  # maximum delay
        "lineage_num_previous_flights",  # number of such flights
        "lineage_expected_flight_time_minutes",  # expected flight time from historical patterns
    ]
    # --- Meta-model predictions (pre-computed by add_meta_model_features.py) -------
    + [
        "predicted_prev_flight_air_time_XGB_1",
        "predicted_prev_flight_turnover_time_XGB_1",
        "predicted_prev_flight_total_duration_XGB_1",
    ]
    # --- Weather variables (current origin) ----------------------------------------
    + [
        "hourlyprecipitation",
        "hourlysealevelpressure",
        "hourlyaltimetersetting",
        "hourlywetbulbtemperature",
        "hourlystationpressure",
        "hourlywinddirection",
        "hourlyrelativehumidity",
        "hourlywindspeed",
        "hourlydewpointtemperature",
        "hourlydrybulbtemperature",
        "hourlyvisibility",
    ]
    # --- Schedule / geography ------------------------------------------------------
    + [
        "crs_elapsed_time",  # scheduled elapsed time
        "distance",  # flight distance
        "elevation",  # airport elevation (if available)
    ]
    # --- Graph features ------------------------------------------------------------
    + [
        "prev_flight_origin_pagerank_weighted",
        "prev_flight_origin_pagerank_unweighted",
        "origin_pagerank_weighted",
        "origin_pagerank_unweighted",
        "dest_pagerank_weighted",
        "dest_pagerank_unweighted",
    ]
)

In [0]:
# Derived columns are named "<feature>_<SUFFIX>": _IMPUTED for mean-filled numerics,
# _INDEX for string-indexed categoricals and _VEC for their one-hot vectors.

# Fill missing numeric values with the column mean.
imputer = Imputer(
    strategy="mean",
    inputCols=numerical_features,
    outputCols=[name + "_IMPUTED" for name in numerical_features],
)

# Map each categorical value to an index; handleInvalid="keep" retains rows whose
# value is invalid instead of raising or dropping them.
indexer = StringIndexer(
    handleInvalid="keep",
    inputCols=categorical_features,
    outputCols=[name + "_INDEX" for name in categorical_features],
)

# One-hot encode the indexed categoricals.
encoder = OneHotEncoder(
    inputCols=[name + "_INDEX" for name in categorical_features],
    outputCols=[name + "_VEC" for name in categorical_features],
)

# Concatenate the encoded categoricals followed by the imputed numerics into "features".
# handleInvalid="skip" filters out rows that still contain invalid values.
assembler = VectorAssembler(
    inputCols=[
        *(name + "_VEC" for name in categorical_features),
        *(name + "_IMPUTED" for name in numerical_features),
    ],
    outputCol="features",
    handleInvalid="skip",
)

# StandardScaler (required for LinearRegression); centers and scales "features"
# into "scaled_features". The XGBoost pipelines in this notebook do not include it.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaled_features",
)

# Distributed XGBoost regressor predicting DEP_DELAY, with one worker per default
# parallelism slot.
# The SparkContext is reached through the `spark` session created at the top of the
# notebook rather than the Databricks-injected `sc` global, so this cell also runs
# where `sc` is not predefined.
# NOTE(review): missing=0.0 makes XGBoost treat every 0.0 feature value as missing,
# which includes genuine zeros in the numeric columns — confirm this is intended.
xgb_regressor = SparkXGBRegressor(
    num_workers=spark.sparkContext.defaultParallelism,
    label_col="DEP_DELAY",
    missing=0.0,
)


In [0]:
# End-to-end XGBoost pipeline: preprocess the raw columns, then fit the regressor.
xgb_pipe = Pipeline(
    stages=[
        imputer,        # numeric columns -> *_IMPUTED
        indexer,        # categorical columns -> *_INDEX
        encoder,        # *_INDEX -> *_VEC
        assembler,      # *_VEC + *_IMPUTED -> "features"
        xgb_regressor,  # "features" -> prediction of DEP_DELAY
    ]
)

In [0]:
# XGBoost pipeline on the preloaded graph + meta-model features, dataset version "3M".
# NOTE(review): `data_loader` is not defined by any cell visible in this notebook.
cv_xgb_preloaded_meta_3M = cv.FlightDelayCV(dataloader=data_loader, estimator=xgb_pipe, version="3M")
cv_xgb_preloaded_meta_3M.fit()

In [0]:
# Evaluate the fitted "3M" run; the cell output is whatever evaluate() returns.
cv_xgb_preloaded_meta_3M.evaluate()

In [0]:
# Same XGBoost pipeline on the preloaded graph + meta-model features, dataset version "12M".
cv_xgb_preloaded_meta_12M = cv.FlightDelayCV(dataloader=data_loader, estimator=xgb_pipe, version="12M")
cv_xgb_preloaded_meta_12M.fit()

In [0]:
# Evaluate the fitted "12M" run; the cell output is whatever evaluate() returns.
cv_xgb_preloaded_meta_12M.evaluate()

In [0]:
# Same XGBoost pipeline on the preloaded graph + meta-model features, dataset version "60M".
cv_xgb_preloaded_meta_60M = cv.FlightDelayCV(dataloader=data_loader, estimator=xgb_pipe, version="60M")
cv_xgb_preloaded_meta_60M.fit()

In [0]:
# Evaluate the fitted "60M" run with evaluate()'s default settings.
cv_xgb_preloaded_meta_60M.evaluate()

In [0]:
# Evaluate the same "60M" run again with use_fold_3_val_train=True.
# NOTE(review): the flag's meaning is defined in cv.py — presumably it switches which
# fold-3 data is used for evaluation; confirm before comparing with the cell above.
cv_xgb_preloaded_meta_60M.evaluate(use_fold_3_val_train=True)

## Run inference with 60M models (need to retrain first though)

In [0]:
# Refit on dataset version "60M" (preloaded graph + meta-model features) so the fitted
# models are available for the inference cells below.
cv_xgb_graph_meta_60M = cv.FlightDelayCV(dataloader=data_loader, estimator=xgb_pipe, version="60M")
cv_xgb_graph_meta_60M.fit()

In [0]:
# Show the `models` attribute of the fitted 60M run (populated by FlightDelayCV in cv.py).
cv_xgb_graph_meta_60M.models

In [0]:
# Evaluate the refitted "60M" run; the cell output is whatever evaluate() returns.
cv_xgb_graph_meta_60M.evaluate()

In [0]:
# Show the `test_model` attribute; this is the model used for inference further down.
cv_xgb_graph_meta_60M.test_model

In [0]:
# Fetch the "60M" dataset version from the loader; the result is indexed below as a
# sequence of folds.
# NOTE(review): `data_loader` is not defined by any cell visible in this notebook.
folds = data_loader.get_version("60M")

In [0]:
# Element [1] of the last fold — presumably the held-out test split (TODO confirm against cv.py).
test_df = folds[-1][1]

In [0]:
# Number of rows in test_df (count() is an action, so this triggers a Spark job).
test_df.count()

In [0]:
# Earliest and latest FL_DATE present in test_df, as a one-row DataFrame.
date_range_test = test_df.agg(
    F.min("FL_DATE").alias("min_date"),
    F.max("FL_DATE").alias("max_date"),
)
display(date_range_test)

In [0]:
# Score test_df with the 60M test model; the plotting cells below read the
# "prediction" column alongside "DEP_DELAY" from the result.
predictions = cv_xgb_graph_meta_60M.test_model.transform(test_df)

In [0]:
# Preview the scored rows.
display(predictions)

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Delay buckets in minutes. pd.cut uses right-closed intervals by default, so these
# edges produce (-inf, 0], (0, 15], (15, 30], (30, 60] and (60, inf].
bins = [-float("inf"), 0, 15, 30, 60, float("inf")]
labels = ["Early/On time", "1-15 min", "16-30 min", "31-60 min", "60+ min"]

# Bring only the two columns needed for the chart to the driver.
pdf = predictions.select("DEP_DELAY", "prediction").toPandas()

# Bin the actual delays.
pdf["delay_bin"] = pd.cut(pdf["DEP_DELAY"], bins=bins, labels=labels)

# Mean actual and predicted delay per bucket.
# observed=False is stated explicitly: "delay_bin" is categorical, and relying on the
# implicit default emits a FutureWarning on recent pandas and would silently drop
# empty buckets from the chart once that default changes.
grouped = (
    pdf.groupby("delay_bin", observed=False)
    .agg(
        mean_dep_delay=("DEP_DELAY", "mean"),
        mean_prediction=("prediction", "mean"),
    )
    .reset_index()
)

# Plot through the returned Axes so the labels are attached to this figure explicitly.
ax = grouped.plot(
    x="delay_bin",
    y=["mean_dep_delay", "mean_prediction"],
    kind="bar",
    figsize=(10, 6),
)
ax.set_xlabel("Delay Category")
ax.set_ylabel("Mean Delay (minutes)")
ax.set_title("Mean Actual and Predicted Delay by Category")
ax.tick_params(axis="x", rotation=45)
ax.legend(["Actual", "Predicted"])
plt.show()

In [0]:
import pandas as pd
import matplotlib.pyplot as plt

# Buckets: everything up to 0 min, then every 15 min up to 180 min, then one for >180 min.
# pd.cut uses right-closed intervals, so (0, 15] is labelled "1-15 min", and so on.
edges = list(range(0, 181, 15))
bins = [-float("inf"), *edges, float("inf")]
labels = (
    ["Early/On time"]
    + [f"{lo + 1}-{hi} min" for lo, hi in zip(edges, edges[1:])]
    + ["180+ min"]
)

# Bring only the two columns needed for the chart to the driver.
pdf = predictions.select("DEP_DELAY", "prediction").toPandas()

pdf["delay_bin"] = pd.cut(
    pdf["DEP_DELAY"],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Mean actual and predicted delay per bucket.
# observed=False is stated explicitly: "delay_bin" is categorical, and relying on the
# implicit default emits a FutureWarning on recent pandas and would silently drop
# empty buckets from the chart once that default changes.
grouped = (
    pdf.groupby("delay_bin", observed=False)
    .agg(
        mean_dep_delay=("DEP_DELAY", "mean"),
        mean_prediction=("prediction", "mean"),
    )
    .reset_index()
)

# Plot through the returned Axes so the labels are attached to this figure explicitly.
ax = grouped.plot(
    x="delay_bin",
    y=["mean_dep_delay", "mean_prediction"],
    kind="bar",
    figsize=(12, 6),
)
ax.set_xlabel("Delay Category")
ax.set_ylabel("Mean Delay (minutes)")
ax.set_title("Mean Actual and Predicted Delay by Category")
ax.tick_params(axis="x", rotation=45)
ax.legend(["Actual", "Predicted"])
plt.show()

## TODO add log graph and add to GitHub

## Trying OTPW data

In [0]:
# Same XGBoost pipeline, but fed by the OTPW loader, dataset version "3M".
cv_xgb_otpw_meta_3M = cv.FlightDelayCV(dataloader=data_loader_otpw, estimator=xgb_pipe, version="3M")
cv_xgb_otpw_meta_3M.fit()

In [0]:
# Evaluate the fitted OTPW "3M" run; the cell output is whatever evaluate() returns.
cv_xgb_otpw_meta_3M.evaluate()

## Try some hyperparameter search

In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Hyperparameter grid to test (2 x 2 = 4 combinations):
#  - max_depth: maximum depth of each tree
#  - n_estimators: number of boosting rounds, i.e. the total number of trees
paramGrid = (
    ParamGridBuilder()
    .addGrid(xgb_regressor.max_depth, [2, 5])
    .addGrid(xgb_regressor.n_estimators, [10, 100])
    .build()
)

# Evaluation metric. The CrossValidator compares the true labels with the predicted
# values for each parameter combination and uses this value to pick the best model.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol=xgb_regressor.getLabelCol(),
    predictionCol=xgb_regressor.getPredictionCol(),
)

# The CrossValidator performs the model tuning.
# numFolds=3 is the PySpark default made explicit; seed pins the random fold
# assignment so repeated runs tune on the same splits and stay comparable.
# NOTE(review): CrossValidator assigns rows to folds at random, ignoring flight date,
# so within the tuning step later flights can inform predictions for earlier ones —
# confirm this is acceptable for time-ordered data.
cv_hp = CrossValidator(
    estimator=xgb_regressor,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [0]:
# Hyperparameter-search pipeline: same preprocessing as xgb_pipe, with the grid-search
# CrossValidator taking the place of the plain regressor as the final stage.
xgb_hp_pipe = Pipeline(
    stages=[
        imputer,
        indexer,
        encoder,
        assembler,
        cv_hp,
    ]
)

In [0]:
# Grid-search pipeline on dataset version "3M".
cv_xgb_gridsearch_meta_3M = cv.FlightDelayCV(dataloader=data_loader, estimator=xgb_hp_pipe, version="3M")
cv_xgb_gridsearch_meta_3M.fit()