In [None]:
# Imports and Spark session
from pyspark.sql import SparkSession, functions as F
import pandas as pd

# Import the loader & evaluator from this repo
from cv import FlightDelayDataLoader, FlightDelayEvaluator

# Create / get the Spark session (works on Databricks).
# getOrCreate() returns the session that is already active when there is one
# and only builds a new one otherwise, so re-running this cell is harmless.

spark = SparkSession.builder.getOrCreate()
print('Spark:', spark)

In [None]:
# Set up the fold loader and read the pre-split folds.
# Point `folder_path` somewhere else if the folds were saved to another location.
loader = FlightDelayDataLoader(
    folder_path="dbfs:/student-groups/Group_4_2",
    n_folds=3,  # same as the split.py default (3 CV folds + 1 test)
    source="CUSTOM",  # use 'PROVIDED' for the saved OTPW_PROVIDED_* files
)
loader.load()

# `folds` is consumed below as a sequence of (train, validation) pairs.
version = '3M'
folds = loader.get_version(version)
print(f'Loaded {len(folds)} folds for version={version}')

# Row counts per fold; count() is called once per split, train first.
for fold_no, (fold_train, fold_val) in enumerate(folds, start=1):
    n_train = fold_train.count()
    n_val = fold_val.count()
    print(f' Fold {fold_no}: train={n_train}, val={n_val}')

In [None]:
# Median-predictor baseline on the CV folds.
# The last entry of `folds` is the final test split, so it is excluded here.
evaluator = FlightDelayEvaluator()
metrics = []

for idx, (train_df, val_df) in enumerate(folds[:-1]):
    # Fail fast with a clear message when the label column is absent.
    if 'DEP_DELAY' not in train_df.columns:
        raise RuntimeError('DEP_DELAY column missing from training data')

    # Ensure DEP_DELAY is numeric before asking for a quantile.
    train_df = train_df.withColumn('DEP_DELAY', F.col('DEP_DELAY').cast('double'))

    # Approximate median (probability 0.5, relative error 0.001); fast and
    # safe for large datasets. Nulls are ignored, and a column with no
    # non-null values yields an empty list, so guard before indexing.
    quantiles = train_df.approxQuantile('DEP_DELAY', [0.5], 0.001)
    if not quantiles:
        raise RuntimeError(
            f'Fold {idx+1}: no non-null DEP_DELAY values in training data; cannot compute median'
        )
    median = quantiles[0]
    print(f'Fold {idx+1} train median DEP_DELAY = {median}')

    # Constant prediction: every validation row gets the training median.
    val_pred = val_df.withColumn('prediction', F.lit(float(median)))

    # Evaluate and tag the result with its 1-based fold number.
    metric = evaluator.evaluate(val_pred)
    metric['fold'] = idx + 1
    metrics.append(metric)

# Convert to pandas for nicer display; fall back to an empty frame when no
# CV folds were evaluated (set_index('fold') would fail without that column).
metrics_pd = pd.DataFrame(metrics).set_index('fold') if metrics else pd.DataFrame()
# The leading '\n' escape puts a blank line before the heading.
print('\nCross-validation fold metrics (median predictor):')
print(metrics_pd)

In [None]:
# Evaluate the median baseline on the final test fold (last tuple in `folds`).
# Uses the `evaluator` created in the cross-validation cell.
train_combined, test_df = folds[-1]

# Same label-column check the CV loop performs on its training splits.
if 'DEP_DELAY' not in train_combined.columns:
    raise RuntimeError('DEP_DELAY column missing from training data')

# Ensure DEP_DELAY is numeric before asking for a quantile.
train_combined = train_combined.withColumn('DEP_DELAY', F.col('DEP_DELAY').cast('double'))

# Approximate median (probability 0.5, relative error 0.001). Nulls are
# ignored, and a column with no non-null values yields an empty list, so
# guard before indexing.
quantiles = train_combined.approxQuantile('DEP_DELAY', [0.5], 0.001)
if not quantiles:
    raise RuntimeError('No non-null DEP_DELAY values in training data; cannot compute median')
median = quantiles[0]

# Constant prediction: every test row gets the training median.
test_pred = test_df.withColumn('prediction', F.lit(float(median)))
test_metric = evaluator.evaluate(test_pred)
# The leading '\n' escape puts a blank line before the heading.
print('\nTest fold metric (median predictor):')
print(test_metric)

### Next steps
- Replace the median predictor with a real `Pipeline` (feature assembler + estimator).
- Use the `FlightDelayCV` wrapper to train Spark ML estimators if you prefer automated fold training.
- Add logging or write metrics to a CSV for experiment tracking.