In [0]:
# Dependencies
import sys
import pandas as pd

# Load the project's cross-validation module directly from its file path
# (the path contains spaces, so it is not reachable via a normal `import`).
import importlib.util

cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"

# Build an import spec for that file under the module name "cv", create an
# empty module object from it, then execute the file to populate the module.
spec = importlib.util.spec_from_file_location(name="cv", location=cv_path)
cv = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cv)

from pyspark.sql import functions as F
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

## Define Features

In [0]:
# Categorical predictors; the pipeline string-indexes and one-hot encodes these.
# "origin" is the origin airport code.
categorical_features = ["day_of_week", "op_carrier", "origin"]

# Numerical predictors; the pipeline mean-imputes these before assembling.
numerical_features = [
    "hourlyprecipitation",
    "hourlysealevelpressure",
    "hourlyaltimetersetting",
    "hourlywetbulbtemperature",
    "hourlystationpressure",
]

## Construct Model Pipeline

In [0]:
# Linear-regression pipeline:
#   impute -> index -> one-hot encode -> assemble -> standardize -> regress.
# No graph/PageRank stage is part of this pipeline.

# Derived column names, computed once so that each stage's outputs are exactly
# the next stage's inputs.
imputed_cols = [f"{name}_IMPUTED" for name in numerical_features]
index_cols = [f"{name}_INDEX" for name in categorical_features]
onehot_cols = [f"{name}_VEC" for name in categorical_features]

# Replace missing numeric values with the column mean.
imputer = Imputer(
    strategy="mean",
    inputCols=numerical_features,
    outputCols=imputed_cols,
)

# Map category strings to numeric indices. handleInvalid="keep" sends labels
# that were not seen at fit time to an extra index instead of raising.
indexer = StringIndexer(
    handleInvalid="keep",
    inputCols=categorical_features,
    outputCols=index_cols,
)

# Turn each index column into a one-hot vector column.
# NOTE(review): with the indexer's "keep" bucket being the last (dropped)
# slot, every real category appears to get its own column, which may make the
# design collinear with the intercept -- confirm p-values are actually produced.
encoder = OneHotEncoder(inputCols=index_cols, outputCols=onehot_cols)

# Concatenate the one-hot vectors followed by the imputed numeric columns.
# handleInvalid="skip" drops rows that still contain invalid values.
assembler = VectorAssembler(
    handleInvalid="skip",
    inputCols=onehot_cols + imputed_cols,
    outputCol="features",
)

# Center and scale every assembled feature.
# NOTE(review): Spark documents that withMean=True builds dense output;
# confirm that is acceptable for the one-hot encoded columns.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaled_features",
)

# Unregularized least squares on the scaled features.
lr = LinearRegression(
    labelCol="DEP_DELAY",
    featuresCol="scaled_features",
    solver="normal",  # Required for p-values and standard errors
    regParam=0.0,  # Required for statistical significance measures
    elasticNetParam=0.0,
)

# Stages run in list order.
pipeline_stages = [imputer, indexer, encoder, assembler, scaler, lr]
lr_pipe = Pipeline(stages=pipeline_stages)

## Run Cross-Validation (Overlapping Folds)

In [0]:
# Cross-validate the linear-regression pipeline with the "overlapping" fold
# strategy. The settings are gathered in one dict so they are easy to scan.
# NOTE(review): the meaning of version / n_sections / train_window_sections is
# defined by cv.FlightDelayCV, which is not visible here -- see cv.py.
overlapping_cv_params = {
    "estimator": lr_pipe,
    "version": "3M",
    "fold_strategy": "overlapping",
    "n_folds": 3,
    "train_window_sections": 2,
    "n_sections": 5,
}

cv_obj_overlapping = cv.FlightDelayCV(**overlapping_cv_params)
cv_obj_overlapping.fit()

## View Cross-Validation Results

In [0]:
# Show the raw metrics collected by the cross-validator.
fold_metrics = cv_obj_overlapping.metrics
print("Cross-Validation Results:", fold_metrics, sep="\n")

# Tabulate the metrics with pandas and average each column.
mean_metrics = pd.DataFrame(fold_metrics).mean()
print("\nMean metrics across folds:", mean_metrics, sep="\n")