In [0]:
"""
cv.py (simplified, CUSTOM-only, no parametrization)

Assumptions:
- Folds were created from split.py with N_FOLDS = 3 and CREATE_TEST_FOLD = True
- Therefore total fold indices written = 4:
    FOLD_1_VAL, FOLD_2_VAL, FOLD_3_VAL, FOLD_4_TEST
- Files live in:
    dbfs:/mnt/mids-w261/student-groups/Group_4_2/processed
- File naming:
    OTPW_CUSTOM_{VERSION}_FOLD_{i}_{TRAIN|VAL|TEST}.parquet
"""

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
import mlflow
mlflow.autolog(disable=True)

# -----------------------------
# HARD-CODED GLOBALS
# -----------------------------
# Directory holding the fold parquet files (see the module docstring for the
# OTPW_{SOURCE}_{VERSION}_FOLD_{i}_{TRAIN|VAL|TEST}.parquet naming scheme).
# NOTE(review): nothing in the visible code reads these module-level names;
# FlightDelayDataLoader carries its own copies of the same values.
FOLDER_PATH = "dbfs:/mnt/mids-w261/student-groups/Group_4_2/processed"
# Dataset family token used in the file names.
SOURCE = "CUSTOM"
# Dataset versions that have fold files on disk.
VERSIONS = ["3M", "12M"]

# 3 CV folds + 1 test fold = 4 total fold indices
TOTAL_FOLDS = 4


class FlightDelayDataLoader:
    """
    CUSTOM-only loader that guarantees all numerical features are cast to double.

    Each version is loaded as a list of ``(train_df, holdout_df)`` tuples, one
    per fold index. The holdout is the ``VAL`` file for every fold except the
    last one, which uses the ``TEST`` file.

    Parameters
    ----------
    folder_path : str
        Directory containing the fold parquet files.
    source : str
        Source token used in the file names (``OTPW_{source}_...``).
    versions : list[str] | None
        Versions loaded by :meth:`load`; defaults to ``["3M", "12M"]``.
    total_folds : int
        Number of fold indices on disk (CV folds plus the final test fold).
    """

    # Whole-value patterns that mean "missing". The string is handed to
    # Spark's regexp_replace (Java regex): ``\\N`` is a literal backslash
    # followed by N, ``\s*`` is an empty/blank value and ``\.`` a lone dot.
    NULL_PATTERN = r'^(NA|N/A|NULL|null|None|none|\\N|\s*|\.|M|T)$'

    # Label columns and the Spark type each one is cast to.
    LABEL_TYPES = (
        ("DEP_DELAY", "double"),
        ("DEP_DEL15", "int"),
        ("SEVERE_DEL60", "int"),
    )

    def __init__(
        self,
        folder_path="dbfs:/mnt/mids-w261/student-groups/Group_4_2/processed",
        source="CUSTOM",
        versions=None,
        total_folds=4,
    ):
        self.folder_path = folder_path
        self.source = source
        self.folds = {}
        self.versions = list(versions) if versions is not None else ["3M", "12M"]
        self.total_folds = total_folds

        # Columns that are cleaned and cast to double when present. This is
        # the casting list, not necessarily the list of model features.
        self.numerical_features = [
            'hourlyprecipitation',
            'hourlysealevelpressure',
            'hourlyaltimetersetting',
            'hourlywetbulbtemperature',
            'hourlystationpressure',
            'hourlywinddirection',
            'hourlyrelativehumidity',
            'hourlywindspeed',
            'hourlydewpointtemperature',
            'hourlydrybulbtemperature',
            'hourlyvisibility',
            'crs_elapsed_time',  # scheduled flight time
            'quarter',  # inferred from month
            'flights',  # number of flights? always 1?
            'distance',  # flight distance, probably important
            # cast here, but meant to be left out of modeling because new
            # predictions will always be in a new year
            'year',
            # latitude and longitude not very useful in linear regression
            'origin_station_lat',
            'origin_station_lon',
            'origin_airport_lat',
            'origin_airport_lon',
            'origin_station_dis',
            'dest_station_lat',
            'dest_station_lon',
            'dest_airport_lat',
            'dest_airport_lon',
            'dest_station_dis',
            'latitude',
            'longitude',
            'elevation',
        ]

    def _cast_numerics(self, df):
        """
        Safely cast all configured numeric columns to doubles.

        Values that match ``NULL_PATTERN`` in full ('', blanks, 'NA', 'M',
        'T', '.', ...) are blanked before the cast. Label columns are cast to
        their expected numeric types. Columns absent from ``df`` are skipped.
        """
        present = set(df.columns)

        for colname in self.numerical_features:
            if colname not in present:
                continue
            as_text = F.col(colname).cast("string")
            cleaned = F.regexp_replace(as_text, self.NULL_PATTERN, "")
            df = df.withColumn(colname, cleaned.cast("double"))

        # Explicitly cast labels to expected numeric types
        for label, spark_type in self.LABEL_TYPES:
            if label in present:
                df = df.withColumn(label, F.col(label).cast(spark_type))

        return df

    def _load_parquet(self, name):
        """Read ``{folder_path}/{name}.parquet`` and normalise its numeric columns."""
        spark = SparkSession.builder.getOrCreate()
        df = spark.read.parquet(f"{self.folder_path}/{name}.parquet")
        return self._cast_numerics(df)

    def _load_version(self, version):
        """Return ``[(train_df, holdout_df), ...]`` for every fold index of ``version``."""
        folds = []
        for fold_idx in range(1, self.total_folds + 1):
            prefix = f"OTPW_{self.source}_{version}_FOLD_{fold_idx}"
            # Only the final fold index is paired with a TEST file.
            holdout = "VAL" if fold_idx < self.total_folds else "TEST"
            train_df = self._load_parquet(f"{prefix}_TRAIN")
            holdout_df = self._load_parquet(f"{prefix}_{holdout}")
            folds.append((train_df, holdout_df))

        return folds

    def load(self):
        """Load every configured version into ``self.folds``."""
        for version in self.versions:
            self.folds[version] = self._load_version(version)

    def get_version(self, version):
        """
        Return the fold list for ``version``.

        Raises
        ------
        KeyError
            If ``version`` has not been loaded.
        """
        try:
            return self.folds[version]
        except KeyError:
            raise KeyError(
                f"Version {version!r} is not loaded; call load() first "
                f"(configured versions: {self.versions})."
            ) from None

# -----------------------------
# EVALUATOR (NULL-SAFE RMSE)
# -----------------------------
class FlightDelayEvaluator:
    """
    Null-safe metrics for a regression prediction of departure delay.

    Three metrics are reported by :meth:`evaluate`:

    - ``rmse``: root mean squared error against ``numeric_label_col``.
    - ``otpa``: accuracy of ``prediction >= 15`` against ``binary_label_col``.
    - ``sddr``: recall of ``prediction >= 60`` against ``severe_label_col``.

    Parameters
    ----------
    prediction_col : str
        Column holding the numeric prediction.
    numeric_label_col, binary_label_col, severe_label_col : str
        Label columns used for RMSE, the 15-threshold metric and the
        60-threshold metric respectively.
    """

    def __init__(
        self,
        prediction_col="prediction",
        numeric_label_col="DEP_DELAY",
        binary_label_col="DEP_DEL15",
        severe_label_col="SEVERE_DEL60",
    ):
        self.prediction_col = prediction_col
        self.numeric_label_col = numeric_label_col
        self.binary_label_col = binary_label_col
        self.severe_label_col = severe_label_col

        self.rmse_evaluator = RegressionEvaluator(
            predictionCol=prediction_col,
            labelCol=numeric_label_col,
            metricName="rmse"
        )

    def calculate_rmse(self, predictions_df):
        """Return the RMSE over rows where both label and prediction are present."""
        # Drop any residual nulls before RegressionEvaluator sees them
        clean = predictions_df.dropna(
            subset=[self.numeric_label_col, self.prediction_col]
        )
        return self.rmse_evaluator.evaluate(clean)

    def _calculate_classification_metrics(self, predictions_df, threshold, label_col):
        """
        Threshold the numeric prediction and score it against a 0/1 label.

        A row is predicted positive when ``prediction >= threshold``. Rows
        with a missing prediction or label are dropped, and rows whose label
        is neither 0 nor 1 are not counted in any cell.

        Returns
        -------
        dict
            ``tp``, ``fp``, ``tn``, ``fn`` counts plus ``precision``,
            ``recall``, ``f1`` and ``accuracy`` (0.0 when undefined).
        """
        # Null-safe for classification too
        df = predictions_df.dropna(subset=[self.prediction_col, label_col])

        predicted_pos = F.col(self.prediction_col) >= threshold
        actual_pos = F.col(label_col) == 1
        actual_neg = F.col(label_col) == 0

        def count_where(condition, name):
            # when() without otherwise() is null for non-matching rows and
            # count() skips nulls, so this counts matching rows (0 if none).
            return F.count(F.when(condition, 1)).alias(name)

        # One aggregation instead of four filter().count() actions: every
        # action re-executes the whole prediction lineage, which is costly
        # when the predictions come from a model-inference stage.
        cells = df.agg(
            count_where(predicted_pos & actual_pos, "tp"),
            count_where(predicted_pos & actual_neg, "fp"),
            count_where(~predicted_pos & actual_neg, "tn"),
            count_where(~predicted_pos & actual_pos, "fn"),
        ).first()
        tp, fp, tn, fn = cells["tp"], cells["fp"], cells["tn"], cells["fn"]

        total = tp + fp + tn + fn
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        accuracy = (tp + tn) / total if total else 0.0

        return dict(tp=tp, fp=fp, tn=tn, fn=fn,
                    precision=precision, recall=recall, f1=f1, accuracy=accuracy)

    def calculate_otpa_metrics(self, predictions_df):
        """Return the accuracy of the 15-threshold prediction vs ``binary_label_col``."""
        return self._calculate_classification_metrics(
            predictions_df, threshold=15, label_col=self.binary_label_col
        )["accuracy"]

    def calculate_sddr_metrics(self, predictions_df):
        """Return the recall of the 60-threshold prediction vs ``severe_label_col``."""
        return self._calculate_classification_metrics(
            predictions_df, threshold=60, label_col=self.severe_label_col
        )["recall"]

    def evaluate(self, predictions_df):
        """Return ``{"rmse": ..., "otpa": ..., "sddr": ...}`` for ``predictions_df``."""
        return {
            "rmse": self.calculate_rmse(predictions_df),
            "otpa": self.calculate_otpa_metrics(predictions_df),
            "sddr": self.calculate_sddr_metrics(predictions_df),
        }


# -----------------------------
# CROSS-VALIDATOR (NO PARAMS)
# -----------------------------
class FlightDelayCV:
    """
    Cross-validate one estimator over the pre-built folds of a data version.

    The estimator only needs ``fit(df)`` returning an object that exposes
    ``transform(df)``. All folds but the last are (train, validation) pairs
    used by :meth:`fit`; the last is the (train, test) pair used by
    :meth:`evaluate`.

    Parameters
    ----------
    estimator : object
        Object with ``fit(train_df)`` returning a model with ``transform(df)``.
    dataloader : FlightDelayDataLoader | None
        Already-loaded loader; when falsy a new one is created and loaded.
    version : str
        Key of the fold list to request from the loader.
    """

    def __init__(self, estimator, dataloader, version):
        self.estimator = estimator
        self.version = version

        if dataloader:
            self.data_loader = dataloader
        else:
            self.data_loader = FlightDelayDataLoader()
            self.data_loader.load()

        self.evaluator = FlightDelayEvaluator()
        self.folds = self.data_loader.get_version(version)

        self.metrics = []
        self.models = []
        self.test_metric = None
        self.test_model = None

    def fit(self):
        """
        Fit and score the estimator on every CV fold.

        Returns
        -------
        pandas.DataFrame
            One row of metrics per fold, followed by ``"mean"`` and ``"std"``
            rows computed over the fold rows only.
        """
        # Start clean so calling fit() again does not mix results from
        # different runs into the summary statistics.
        self.metrics = []
        self.models = []

        # CV folds only (exclude last test fold)
        for train_df, val_df in self.folds[:-1]:
            model = self.estimator.fit(train_df)
            preds = model.transform(val_df)

            self.metrics.append(self.evaluator.evaluate(preds))
            self.models.append(model)

        m = pd.DataFrame(self.metrics)
        # Compute both summaries before appending either one; otherwise the
        # "mean" row would be treated as an extra fold when taking the std.
        mean_row = m.mean()
        std_row = m.std()
        m.loc["mean"] = mean_row
        m.loc["std"] = std_row
        return m

    def evaluate(self):
        """Fit on the final fold's training split and return its test metrics."""
        train_df, test_df = self.folds[-1]
        self.test_model = self.estimator.fit(train_df)
        preds = self.test_model.transform(test_df)
        self.test_metric = self.evaluator.evaluate(preds)
        return self.test_metric

In [0]:
# Build the shared loader once and register the folds of every configured
# version; the same instance is passed to FlightDelayCV in the cells below.
data_loader = FlightDelayDataLoader()
data_loader.load()

In [0]:
# =====================================================
# MLP REGRESSOR INTEGRATED WITH FLIGHTDELAYCV
# =====================================================

from pyspark.sql import SparkSession, functions as F, Window
from pyspark.ml.feature import (
    Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline, PipelineModel
import tensorflow as tf
from tensorflow import keras
# from horovod.spark.keras import HorovodRunner
from tensorflow.keras import layers, regularizers, callbacks
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


# =====================================================
# 1. MLP ESTIMATOR (Spark-compatible interface)
# =====================================================


from pyspark.ml.functions import vector_to_array
import numpy as np
import tensorflow as tf

class RMSE(tf.keras.metrics.Metric):
    """Root-mean-squared-error metric that wraps a Keras MeanSquaredError."""

    def __init__(self, name="rmse", **kwargs):
        super().__init__(name=name, **kwargs)
        # All state lives in the wrapped MSE metric; the square root is only
        # taken when the value is read.
        self.mse = tf.keras.metrics.MeanSquaredError()

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Forward one batch of targets and predictions to the wrapped MSE."""
        self.mse.update_state(y_true, y_pred, sample_weight=sample_weight)

    def result(self):
        """Return the square root of the wrapped metric's current value."""
        mean_squared_error = self.mse.result()
        return tf.sqrt(mean_squared_error)

    def reset_state(self):
        """Reset the wrapped MSE metric."""
        self.mse.reset_state()

class SparkMLPRegressor:
    """
    Keras MLP regressor with a Spark-style ``fit`` / ``transform`` interface.

    ``fit`` streams rows of a preprocessed Spark DataFrame to the driver and
    trains with ``train_on_batch``. ``transform`` rebuilds the trained network
    inside ``mapInPandas`` and appends a ``prediction`` column.

    The input DataFrame must contain a ``scaled_features`` vector column;
    ``fit`` additionally reads the ``DEP_DELAY`` label column.

    Parameters
    ----------
    hidden_layers : list[int] | None
        Units per hidden Dense layer; ``[128, 64]`` when omitted or empty.
    dropout_rate : float
        Dropout applied after every hidden layer.
    learning_rate : float
        Adam learning rate.
    batch_size : int
        Number of rows per ``train_on_batch`` call.
    epochs : int
        Maximum number of full passes over the training DataFrame.
    early_stopping : bool
        Stop once the mean training loss has not improved for ``patience``
        consecutive epochs. Best weights are not restored.
    patience : int
        Number of non-improving epochs tolerated by early stopping.
    verbose : int
        When truthy, print the mean loss after every epoch.
    """

    def __init__(self, hidden_layers=None, dropout_rate=0.3, learning_rate=0.001,
                 batch_size=256, epochs=1, early_stopping=True, patience=10, verbose=1):
        self.hidden_layers = hidden_layers or [128, 64]
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs  # Total passes over data (external loop)
        self.early_stopping = early_stopping
        self.patience = patience
        self.verbose = verbose

        self.model = None
        self.input_dim = None

    @staticmethod
    def _configure_gpus():
        """Best-effort: enable memory growth on every visible GPU."""
        try:
            gpus = tf.config.experimental.list_physical_devices('GPU')
            if gpus:
                # TensorFlow requires the memory-growth setting to be the
                # same on all GPUs, so apply it to each device, not just one.
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
                names = ", ".join(gpu.name for gpu in gpus)
                print(f"Successfully configured TensorFlow to use GPU: {names}")
            else:
                print("No GPU devices found. Training will proceed on CPU.")
        except Exception as e:
            # Deliberately non-fatal: e.g. the setting cannot be changed once
            # the devices have been initialised.
            print(f"Error setting up GPU configuration: {e}")

    def _build_model(self, input_dim):
        """Build, compile and store the network for ``input_dim`` input features."""
        self._configure_gpus()

        self.input_dim = input_dim
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Input(shape=(input_dim,)))

        # Every hidden block: Dense (L2) -> BatchNorm -> ReLU -> Dropout
        for units in self.hidden_layers:
            model.add(tf.keras.layers.Dense(
                units, kernel_regularizer=tf.keras.regularizers.l2(1e-4)))
            model.add(tf.keras.layers.BatchNormalization())
            model.add(tf.keras.layers.Activation('relu'))
            model.add(tf.keras.layers.Dropout(self.dropout_rate))

        model.add(tf.keras.layers.Dense(1))

        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        # We compile with a loss, but we will use train_on_batch, not model.fit
        model.compile(optimizer=optimizer, loss='mse', metrics=[RMSE()])
        self.model = model
        return model

    def _iter_batches(self, rows):
        """
        Group an iterable of rows into ``(X, y)`` NumPy batches.

        Each row must support ``row["features_arr"]`` and ``row["DEP_DELAY"]``.
        Batches hold ``batch_size`` rows; a final smaller batch is yielded
        when rows are left over.
        """
        features, labels = [], []
        for row in rows:
            features.append(row["features_arr"])
            labels.append(row["DEP_DELAY"])
            if len(features) >= self.batch_size:
                yield (np.asarray(features, dtype=np.float32),
                       np.asarray(labels, dtype=np.float32))
                features, labels = [], []
        if features:
            yield (np.asarray(features, dtype=np.float32),
                   np.asarray(labels, dtype=np.float32))

    def fit(self, df):
        """
        Train on ``df`` sequentially on the driver, one small batch at a time.

        Rows are pulled with ``rdd.toLocalIterator()`` so the full dataset is
        never materialised on the driver. Rows are visited in the order the
        iterator yields them; no shuffling is done between epochs.

        Returns
        -------
        SparkMLPRegressor
            ``self``, with ``self.model`` trained.

        Raises
        ------
        ValueError
            If the feature dimension cannot be determined (for example when
            no usable training row exists).
        """
        print("Starting sequential training on driver in small batches...")

        # 1. Prepare data stream: convert Vector to array, drop unusable rows.
        #    A NaN label would turn the loss (and then the weights) into NaN,
        #    so it is filtered together with nulls.
        df_arrays = df.select(
            vector_to_array("scaled_features").alias("features_arr"),
            "DEP_DELAY"
        ).filter(
            F.col("features_arr").isNotNull()
            & F.col("DEP_DELAY").isNotNull()
            & ~F.isnan(F.col("DEP_DELAY"))
        ).filter(F.size(F.col("features_arr")) > 0)

        # One row is enough to learn the network's input width.
        try:
            sample_row = df_arrays.head()
        except Exception as e:
            raise ValueError(f"Failed to get feature dimension: {e}") from e
        if sample_row is None:
            raise ValueError(
                "Failed to get feature dimension: no usable training rows")
        self.input_dim = len(sample_row["features_arr"])

        if self.model is None:
            self._build_model(self.input_dim)

        # 2. Manual epoch loop with a simple patience counter.
        current_patience = 0
        best_loss = float('inf')

        for epoch in range(self.epochs):
            print(f"--- Epoch {epoch + 1}/{self.epochs} ---")

            total_loss = 0.0
            total_batches = 0

            # Stream rows from the workers; only one batch is held at a time.
            for X_batch, y_batch in self._iter_batches(df_arrays.rdd.toLocalIterator()):
                result = self.model.train_on_batch(X_batch, y_batch)
                # With compiled metrics the result is [loss, rmse]; the loss
                # is always the first entry.
                total_loss += float(np.atleast_1d(result)[0])
                total_batches += 1

            epoch_loss = total_loss / total_batches if total_batches > 0 else 0
            if self.verbose:
                print(f"Epoch {epoch + 1} finished. Mean Loss: {epoch_loss:.4f}")

            # 3. Early stopping on the mean *training* loss.
            if self.early_stopping:
                if epoch_loss < best_loss:
                    best_loss = epoch_loss
                    current_patience = 0
                    # NOTE: In a real system, you would save model weights here
                else:
                    current_patience += 1
                    if current_patience >= self.patience:
                        print(f"Early stopping triggered after {epoch + 1} epochs.")
                        break

        print("Sequential training complete.")
        return self

    def transform(self, df):
        """
        Append a ``prediction`` column to ``df`` using the trained network.

        The architecture (JSON) and weights are captured by the partition
        function, so each partition rebuilds the network once and predicts
        batch by batch via ``mapInPandas``.

        Raises
        ------
        ValueError
            If :meth:`fit` has not been called.
        """
        if self.model is None:
            raise ValueError("Model not fitted.")

        from pyspark.sql.types import StructField, StructType

        # Serialize model state (shipped to workers with the function)
        model_json = self.model.to_json()
        model_weights = self.model.get_weights()

        # Build a NEW StructType. StructType.add() mutates and returns the
        # same object, and df.schema may be cached on the DataFrame, so
        # calling add() on it would silently alter the caller's schema.
        schema = StructType(
            list(df.schema.fields) + [StructField("prediction", DoubleType(), True)]
        )

        def predict_partition_full(iterator):
            # 1. Initialize model ONCE per partition
            worker_model = tf.keras.models.model_from_json(model_json)
            worker_model.set_weights(model_weights)

            for pdf_batch in iterator:
                if len(pdf_batch) == 0:
                    # np.stack rejects an empty sequence; pass empty batches
                    # through with an empty prediction column.
                    preds = np.empty(0, dtype=np.float64)
                else:
                    # 2. Extract features as a 2-D array and predict
                    X_batch = np.stack(pdf_batch["features_arr"].values)
                    preds = worker_model.predict(X_batch, verbose=0).flatten()
                    preds = preds.astype(np.float64)

                # 3. Attach predictions and remove the helper column
                yield pdf_batch.assign(prediction=preds).drop(columns=["features_arr"])

        # Add the array column temporarily
        df_with_arr = df.withColumn("features_arr", vector_to_array("scaled_features"))

        # Final transform
        return df_with_arr.mapInPandas(predict_partition_full, schema=schema)

    

# =====================================================
# 2. MLP PIPELINE WRAPPER
# =====================================================

class MLPFlightDelayPipeline:
    """
    Wrapper that combines Spark preprocessing + MLP into a single estimator.
    Compatible with FlightDelayCV.

    Parameters
    ----------
    categorical_features : list[str]
        Columns that are string-indexed and one-hot encoded.
    numerical_features : list[str]
        Columns that are mean-imputed.
    mlp_params : dict | None
        Keyword arguments forwarded to ``SparkMLPRegressor``.
    """

    def __init__(
        self,
        categorical_features,
        numerical_features,
        mlp_params=None,
    ):
        self.categorical_features = categorical_features
        self.numerical_features = numerical_features
        self.mlp_params = mlp_params or {}

        self.preprocessing_pipeline = None
        self.mlp_regressor = None

    def _build_preprocessing_pipeline(self):
        """
        Build the (unfitted) Spark ML pipeline for feature preprocessing.

        Stages: mean imputation -> string indexing -> one-hot encoding ->
        vector assembly -> standard scaling into ``scaled_features``.
        The pipeline is also stored on ``self.preprocessing_pipeline``.
        """
        imputed_cols = [f"{name}_IMPUTED" for name in self.numerical_features]
        index_cols = [f"{name}_INDEX" for name in self.categorical_features]
        vector_cols = [f"{name}_VEC" for name in self.categorical_features]

        imputer = Imputer(
            inputCols=self.numerical_features,
            outputCols=imputed_cols,
            strategy="mean"
        )

        indexer = StringIndexer(
            inputCols=self.categorical_features,
            outputCols=index_cols,
            handleInvalid="keep"
        )

        encoder = OneHotEncoder(
            inputCols=index_cols,
            outputCols=vector_cols,
            dropLast=False
        )

        assembler = VectorAssembler(
            inputCols=vector_cols + imputed_cols,
            outputCol="features",
            handleInvalid="skip"
        )

        scaler = StandardScaler(
            inputCol="features",
            outputCol="scaled_features",
            withMean=True,
            withStd=True
        )

        self.preprocessing_pipeline = Pipeline(
            stages=[imputer, indexer, encoder, assembler, scaler]
        )

        return self.preprocessing_pipeline

    def fit(self, df):
        """
        Fit preprocessing pipeline and MLP on training data.

        Preprocessing is refitted on every call, so imputation means,
        category indices and scaling statistics always come from the ``df``
        being fitted rather than from whichever dataset was fitted first.

        Returns
        -------
        MLPFlightDelayPipeline
            A fitted snapshot exposing ``transform``. ``self`` is updated
            with the same fitted state, so ``pipeline.fit(df)`` followed by
            ``pipeline.transform(...)`` keeps working. Because each call
            returns a separate snapshot, models collected across several
            ``fit`` calls stay independent.
        """
        import copy

        # Fresh preprocessing fitted on this training set
        self.preprocessing_pipeline = self._build_preprocessing_pipeline().fit(df)
        preprocessed = self.preprocessing_pipeline.transform(df)

        # Fresh MLP for this training set
        regressor = SparkMLPRegressor(**self.mlp_params)
        regressor.fit(preprocessed)
        self.mlp_regressor = regressor

        # A shallow copy is enough: the next fit() rebinds both fitted
        # attributes on self to new objects instead of mutating these.
        return copy.copy(self)

    def transform(self, df):
        """
        Preprocess and generate predictions on new data.

        Raises
        ------
        ValueError
            If the pipeline has not been fitted.
        """
        if self.preprocessing_pipeline is None or self.mlp_regressor is None:
            raise ValueError("Pipeline not fitted yet. Call fit() first.")

        # Apply preprocessing, then predict
        preprocessed = self.preprocessing_pipeline.transform(df)
        return self.mlp_regressor.transform(preprocessed)


# =====================================================
# 3. FEATURE ENGINEERING (OPTIONAL ENHANCEMENT)
# =====================================================

class FlightDelayFeatureEngineer:
    """
    Fluent helper that appends optional cyclical and interaction columns
    to a DataFrame before preprocessing.
    """

    def __init__(self, df):
        self.df = df

    def add_temporal_features(self):
        """Append sin/cos encodings of ``month`` (period 12) and ``day_of_week`` (period 7)."""
        # (source column, period, sine column, cosine column)
        cyclical_specs = (
            ("month", 12, "month_sin", "month_cos"),
            ("day_of_week", 7, "day_of_week_sin", "day_of_week_cos"),
        )
        enriched = self.df
        for source, period, sin_name, cos_name in cyclical_specs:
            angle = 2 * F.lit(np.pi) * F.col(source) / period
            enriched = enriched.withColumn(sin_name, F.sin(angle))
            enriched = enriched.withColumn(cos_name, F.cos(angle))
        self.df = enriched
        return self

    def add_interaction_features(self):
        """Append pairwise product columns for the MLP to use."""
        # (new column, left factor, right factor)
        product_specs = (
            ("distance_x_windspeed", "distance", "hourlywindspeed"),
            ("distance_x_visibility", "distance", "hourlyvisibility"),
            ("pressure_x_humidity", "hourlysealevelpressure", "hourlyrelativehumidity"),
        )
        enriched = self.df
        for new_name, left, right in product_specs:
            enriched = enriched.withColumn(new_name, F.col(left) * F.col(right))
        self.df = enriched
        return self

    def get(self):
        """Return the DataFrame with every feature added so far."""
        return self.df


# =====================================================
# 4. USAGE WITH FLIGHTDELAYCV
# =====================================================

# Feature definitions
# Categorical columns: string-indexed and one-hot encoded by the pipeline.
categorical_features = [
    'day_of_week', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'day_of_month', 'month'
]

# Numerical columns: mean-imputed, then standardized with the assembled vector.
numerical_features = [
    'hourlyprecipitation', 'hourlysealevelpressure', 'hourlyaltimetersetting',
    'hourlywetbulbtemperature', 'hourlystationpressure', 'hourlywinddirection',
    'hourlyrelativehumidity', 'hourlywindspeed', 'hourlydewpointtemperature',
    'hourlydrybulbtemperature', 'hourlyvisibility', 'crs_elapsed_time', 'distance', 'elevation'
]

# MLP hyperparameters (tunable for your data); forwarded to SparkMLPRegressor
mlp_params = {
    'hidden_layers': [512, 256, 128],  # Units of the three hidden Dense layers
    'dropout_rate': 0.1,
    'learning_rate': 0.005,
    'batch_size': 128,  # Rows per train_on_batch call
    'epochs': 10,  # Maximum full passes over the training data
    'early_stopping': True,
    'patience': 5,  # Non-improving epochs (mean training loss) tolerated before stopping
    'verbose': 1,  # Set to 1 to see training progress
}

# Initialize MLP pipeline
mlp_pipeline = MLPFlightDelayPipeline(
    categorical_features=categorical_features,
    numerical_features=numerical_features,
    mlp_params=mlp_params,
)

# Use with FlightDelayCV, reusing the loader created in the earlier cell
cv = FlightDelayCV(
    estimator=mlp_pipeline,
    dataloader=data_loader,
    version="12M"
)

# Run cross-validation (fits one model per CV fold)
cv_results = cv.fit()
print("Cross-Validation Results:")
print(cv_results)

# Evaluate on held-out test fold (fits once more on the last fold's training split)
test_results = cv.evaluate()
print("\nTest Set Results:")
print(test_results)



In [0]:

# =====================================================
# 5. OPTIONAL: PERFORMANCE DIAGNOSTICS AND GRID SEARCH
# =====================================================

def profile_mlp_timing(data_loader, version="12M"):
    """
    Profile where time is spent: preprocessing, row extraction, or training.

    Uses the training split of the first fold of ``version`` together with
    the module-level ``categorical_features``, ``numerical_features`` and
    ``mlp_params``.

    Parameters
    ----------
    data_loader : FlightDelayDataLoader
        Loader whose folds for ``version`` are already loaded.
    version : str
        Data version to profile.

    Returns
    -------
    dict
        ``preprocess_time``, ``extract_time`` and ``train_time`` in seconds.
    """
    import time

    train_df, _ = data_loader.get_version(version)[0]

    # Time 1: Preprocessing pipeline build + fit
    print("Timing preprocessing pipeline...")
    start = time.perf_counter()

    # Reuse the estimator's own pipeline definition so this profile always
    # measures the same stages that real training runs.
    preprocessing_pipeline = MLPFlightDelayPipeline(
        categorical_features=categorical_features,
        numerical_features=numerical_features,
    )._build_preprocessing_pipeline()

    fitted_pipeline = preprocessing_pipeline.fit(train_df)
    preprocessed = fitted_pipeline.transform(train_df).cache()
    try:
        preprocessed.count()  # Force evaluation
        preprocess_time = time.perf_counter() - start

        # Time 2: Data extraction to NumPy
        print("Timing data extraction...")
        start = time.perf_counter()
        data_rdd = preprocessed.select("scaled_features", "DEP_DELAY").rdd.map(
            lambda row: (row[0].toArray(), float(row[1]) if row[1] is not None else np.nan)
        )
        # Stream the rows instead of collect(): the converted rows are only
        # needed for timing, and holding all of them on the driver at once
        # can exhaust its memory.
        for _ in data_rdd.toLocalIterator():
            pass
        extract_time = time.perf_counter() - start

        # Time 3: MLP training
        print("Timing MLP training...")
        start = time.perf_counter()

        mlp = SparkMLPRegressor(**mlp_params)
        mlp.fit(preprocessed)

        train_time = time.perf_counter() - start
    finally:
        # Release the cached data even if a stage fails.
        preprocessed.unpersist()

    print(f"\n{'='*60}")
    print(f"TIMING BREAKDOWN:")
    print(f"{'='*60}")
    print(f"Preprocessing pipeline: {preprocess_time:.2f}s")
    print(f"Data extraction:       {extract_time:.2f}s")
    print(f"MLP training:          {train_time:.2f}s")
    print(f"Total:                 {preprocess_time + extract_time + train_time:.2f}s")
    print(f"{'='*60}")

    return {
        'preprocess_time': preprocess_time,
        'extract_time': extract_time,
        'train_time': train_time,
    }

def grid_search_mlp(data_loader, version="12M", param_grid=None):
    """
    Simple grid search over MLP hyperparameters across CV folds.

    Every configuration is merged over the module-level ``mlp_params``,
    cross-validated with ``FlightDelayCV`` and then scored on the test fold.

    NOTE: test metrics are reported for every configuration. Choose the
    configuration from ``cv_mean_rmse``; picking it from the test columns
    would make the reported test score optimistic.

    Parameters
    ----------
    data_loader : FlightDelayDataLoader
        Loader whose folds for ``version`` are already loaded.
    version : str
        Data version to search on.
    param_grid : list[dict] | None
        Hyperparameter overrides, one dict per configuration. When omitted,
        the built-in three-configuration grid is used.

    Returns
    -------
    pandas.DataFrame
        One row per configuration with ``config``, ``cv_mean_rmse``,
        ``test_rmse``, ``test_otpa`` and ``test_sddr``.
    """
    if param_grid is None:
        param_grid = [
            {
                'hidden_layers': [256, 128],
                'dropout_rate': 0.2,
                'epochs': 50,
            },
            {
                'hidden_layers': [512, 256, 128, 64],
                'dropout_rate': 0.3,
                'epochs': 100,
            },
            {
                'hidden_layers': [1024, 512, 256, 128, 64],
                'dropout_rate': 0.4,
                'epochs': 150,
            },
        ]

    results = []

    for i, params in enumerate(param_grid):
        print(f"\n{'='*60}")
        print(f"Testing Config {i+1}: {params}")
        print(f"{'='*60}")

        # Local names are kept distinct from the module-level `mlp_pipeline`
        # and `cv` objects created by the usage cell.
        candidate = MLPFlightDelayPipeline(
            categorical_features=categorical_features,
            numerical_features=numerical_features,
            mlp_params={**mlp_params, **params},
        )

        validator = FlightDelayCV(
            estimator=candidate,
            dataloader=data_loader,
            version=version
        )

        cv_results = validator.fit()
        test_results = validator.evaluate()

        results.append({
            'config': params,
            'cv_mean_rmse': cv_results.loc['mean', 'rmse'],
            'test_rmse': test_results['rmse'],
            'test_otpa': test_results['otpa'],
            'test_sddr': test_results['sddr'],
        })

        print(f"Test RMSE: {test_results['rmse']:.4f}")
        print(f"Test OTPA: {test_results['otpa']:.4f}")
        print(f"Test SDDR: {test_results['sddr']:.4f}")

    return pd.DataFrame(results)