In [0]:

# Spark settings applied up front to reduce the chance of OOM errors.

# Drop everything currently cached in this Spark session.
spark.catalog.clearCache()

# Limit the number of records per Arrow batch exchanged between the JVM and Python.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "16000") 
try:
    # Best effort: this key is Databricks-specific, so failing to set it is not fatal.
    spark.conf.set("spark.databricks.io.cache.enabled", "false")
except Exception as exc:
    # Report instead of silently swallowing, and do not trap KeyboardInterrupt/SystemExit.
    print(f"Could not disable the Databricks IO cache (continuing): {exc}")

print("Spark cache cleared.")

import importlib.util
import sys

# Load cv module directly from file path
# The file is executed as a module object bound to the notebook-global name `cv`.
# NOTE: the module is not inserted into sys.modules here, so it is only reachable
# through this `cv` binding (a plain `import cv` elsewhere would not find it).
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
spec = importlib.util.spec_from_file_location("cv", cv_path)
cv = importlib.util.module_from_spec(spec)
# Runs the module's top-level code; raises if the file cannot be read or fails to execute.
spec.loader.exec_module(cv)


import uuid
from pathlib import Path
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType


from pyspark.sql import SparkSession, functions as F
from pathlib import Path
from pyspark.ml.feature import (
    Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array
from pyspark.sql.types import DoubleType

# >>> PYTORCH AND DISTRIBUTOR IMPORTS <<<
import torch
import torch.nn as nn
import torch.optim as optim
from pyspark.ml.torch.distributor import TorchDistributor 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# --- PYTORCH TRAIN FUNCTION (RUNS ON WORKERS) ---

In [0]:
# Enable better logging
import logging
# Root logger is configured at DEBUG; individual noisy loggers are capped below.
logging.basicConfig(level=logging.DEBUG)

# Cap the py4j bridge loggers at ERROR and the Spark/Hadoop-named loggers at WARN.
for _level, _logger_names in (
    (logging.ERROR, ('py4j', 'py4j.java_gateway', 'py4j.clientserver')),
    (logging.WARN, ('org.apache.spark', 'org.sparkproject', 'org.apache.hadoop')),
):
    for _logger_name in _logger_names:
        logging.getLogger(_logger_name).setLevel(_level)
# Do not leave loop temporaries behind in the notebook namespace.
del _level, _logger_names, _logger_name

# The logger named after this module/notebook stays at INFO.
logging.getLogger(__name__).setLevel(logging.INFO)



# Monitor memory
import psutil
import torch

def print_memory_stats():
    """
    Print the current process's resident memory and, when CUDA is available,
    the allocated / peak-allocated memory of every visible GPU (all in GB).
    """
    bytes_per_gb = 1024**3

    rss_gb = psutil.Process().memory_info().rss / bytes_per_gb
    print(f"CPU Memory: {rss_gb:.2f} GB")

    if not torch.cuda.is_available():
        return

    for gpu_idx in range(torch.cuda.device_count()):
        allocated_gb = torch.cuda.memory_allocated(gpu_idx) / bytes_per_gb
        peak_gb = torch.cuda.max_memory_allocated(gpu_idx) / bytes_per_gb
        print(f"GPU {gpu_idx} Memory: {allocated_gb:.2f} GB / {peak_gb:.2f} GB max")

In [0]:


# --- SHARED MODEL DEFINITION ---
class PyTorchMLPRegressor(nn.Module):
    """
    MLP regressor used both by the distributed training function and by inference.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout; a final
    Linear layer maps to a single output per row.

    Args:
        input_dim: number of input features.
        hidden_layers: sequence of hidden layer widths, in order.
        dropout_rate: dropout probability applied after every hidden layer.
    """
    def __init__(self, input_dim, hidden_layers, dropout_rate=0.3):
        super().__init__()
        # widths[k] -> widths[k + 1] describes hidden layer k.
        widths = [input_dim, *hidden_layers]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.extend((
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ))
        # Regression head: one value per input row.
        stack.append(nn.Linear(widths[-1], 1))
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Return predictions with the trailing size-1 dimension removed."""
        predictions = self.network(x)
        return predictions.squeeze(1)
    

In [0]:
# =====================================================
# KEY FIXES FOR 60M DATASET SCALING
# =====================================================

# 1. TRAINING FUNCTION - Optimized for large datasets
def train_fn(params):
    """
    Per-process training entry point (passed to TorchDistributor.run by the estimator).

    Initializes the torch.distributed process group, splits the parquet shards under
    ``params["train_path"]`` into train/validation file lists (90/10 by file), trains a
    DDP-wrapped PyTorchMLPRegressor with Adam + MSE loss and early stopping on the
    globally averaged validation loss, and has rank 0 save the model's state_dict.

    Args:
        params (dict): keys read here are "use_gpu", "train_path", "batch_size",
            "input_dim", "hidden_layers", "dropout_rate", "learning_rate",
            "weight_decay", "epochs", "model_path" and the optional "patience"
            (defaults to 5).

    Returns:
        None. Rank 0 writes the state_dict to ``params["model_path"]``.

    Exits:
        Calls ``sys.exit(1)`` after printing the traceback if anything raises.
    """
    import os, sys, traceback, random
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP
    from torch.utils.data import DataLoader, IterableDataset
    import numpy as np
    import pandas as pd

    # --- Early Stopping Helper ---
    class EarlyStopping:
        """Sets ``early_stop`` after ``patience`` consecutive calls without improvement."""
        def __init__(self, patience=5, min_delta=0.0):
            self.patience = patience
            self.min_delta = min_delta
            self.counter = 0
            self.best_loss = None
            self.early_stop = False

        def __call__(self, val_loss):
            if self.best_loss is None:
                self.best_loss = val_loss
            elif val_loss > self.best_loss - self.min_delta:
                # No improvement of at least min_delta over the best loss so far.
                self.counter += 1
                if self.counter >= self.patience:
                    self.early_stop = True
            else:
                self.best_loss = val_loss
                self.counter = 0

    # Bound before the try so the error handler can always read it, even when the
    # failure happens before/while the process group is being initialized.
    rank = None

    try:
        # --- DDP Init ---
        backend = "nccl" if params["use_gpu"] and torch.cuda.is_available() else "gloo"
        dist.init_process_group(backend=backend)

        if params["use_gpu"] and torch.cuda.is_available():
            local_rank = int(os.environ.get("LOCAL_RANK", 0))
            device = torch.device(f"cuda:{local_rank}")
            torch.cuda.set_device(device)
            device_ids = [local_rank]
        else:
            device = torch.device("cpu")
            device_ids = None

        rank = dist.get_rank()
        world_size = dist.get_world_size()

        # Convert a dbfs:/ URI to the /dbfs/ local mount path.
        path_local = params["train_path"].replace("dbfs:/", "/dbfs/") if params["train_path"].startswith("dbfs:/") else params["train_path"]

        # Collect data files recursively, skipping hidden/metadata files.
        all_files = []
        for root, dirs, files in os.walk(path_local):
            for f in files:
                if f.endswith('.parquet') and not f.startswith('.') and not f.startswith('_'):
                    all_files.append(os.path.join(root, f))

        # Sort first so every rank shuffles the same starting order.
        all_files = sorted(all_files)

        if rank == 0:
            print(f"Found {len(all_files)} parquet files")

        # Without any shard the loop below would run zero batches and still save an
        # untrained model, so fail loudly instead.
        if not all_files:
            raise FileNotFoundError(f"No parquet files found under {path_local}")

        # Deterministic shuffle (same seed on every rank -> same split everywhere)
        random.Random(42).shuffle(all_files)

        # 10% Validation Split (by file, not by row)
        split_idx = max(1, int(len(all_files) * 0.9))
        train_files_global = all_files[:split_idx]
        val_files_global = all_files[split_idx:]

        if not val_files_global:
            print(f"Warning: Not enough files for validation split. Using single file.")
            val_files_global = all_files[-1:]
            # With a single shard, all_files[:-1] is empty; reuse that shard for
            # training rather than training on nothing.
            train_files_global = all_files[:-1] or all_files

        class ParquetFlightIterableDataset(IterableDataset):
            """Streams (features, label) rows from this rank's/worker's share of the files."""
            def __init__(self, file_list, rank, world_size):
                self.file_list = file_list
                self.rank = rank
                self.world_size = world_size

            def __iter__(self):
                worker_info = torch.utils.data.get_worker_info()
                if worker_info is None:
                    my_files = self.file_list[self.rank::self.world_size]
                else:
                    # Split by process rank first, then by DataLoader worker.
                    gpu_files = self.file_list[self.rank::self.world_size]
                    my_files = gpu_files[worker_info.id::worker_info.num_workers]

                # my_files is a fresh slice, so this does not reorder self.file_list.
                random.shuffle(my_files)
                for f in my_files:
                    try:
                        if not os.path.exists(f):
                            if self.rank == 0:
                                print(f"Warning: File not found: {f}")
                            continue

                        pdf = pd.read_parquet(f, columns=["features_arr", "DEP_DELAY"], engine='pyarrow')
                        if len(pdf) == 0:
                            continue

                        X = np.stack(pdf["features_arr"].values).astype(np.float32, copy=False)
                        y = pdf["DEP_DELAY"].values.astype(np.float32, copy=False)

                        for i in range(len(y)):
                            yield torch.from_numpy(X[i]), torch.tensor(y[i])
                    except Exception as e:
                        # Unreadable shards are skipped rather than aborting the epoch.
                        if self.rank == 0:
                            print(f"Error reading {f}: {str(e)}")
                        continue

        train_ds = ParquetFlightIterableDataset(train_files_global, rank, world_size)
        val_ds = ParquetFlightIterableDataset(val_files_global, rank, world_size)

        # prefetch_factor=1 keeps at most one prefetched batch per worker in memory.
        train_loader = DataLoader(train_ds, batch_size=params["batch_size"],
                                   num_workers=6, prefetch_factor=1, pin_memory=True)
        val_loader = DataLoader(val_ds, batch_size=params["batch_size"],
                                num_workers=6, prefetch_factor=1, pin_memory=True)

        # --- Model Setup ---
        model = PyTorchMLPRegressor(params["input_dim"], params["hidden_layers"], params["dropout_rate"]).to(device)
        ddp_model = DDP(model, device_ids=device_ids)
        optimizer = optim.Adam(ddp_model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"])
        criterion = nn.MSELoss()

        # Initialize Early Stopping
        early_stopper = EarlyStopping(patience=params.get("patience", 5))

        # --- Training Loop ---
        # NOTE(review): every rank iterates its own subset of files, so ranks may see
        # different batch counts; presumably DDP's gradient sync can stall in that
        # case -- confirm, and consider torch.distributed.algorithms.join if it does.
        for epoch in range(params["epochs"]):
            # 1. Train
            ddp_model.train()
            train_loss_sum = 0.0
            train_batches = 0

            for xb, yb in train_loader:
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                out = ddp_model(xb)
                loss = criterion(out, yb)
                loss.backward()
                optimizer.step()
                train_loss_sum += loss.item()
                train_batches += 1

            # 2. Validation
            ddp_model.eval()
            val_loss_sum = 0.0
            val_batches = 0
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb, yb = xb.to(device), yb.to(device)
                    out = ddp_model(xb)
                    loss = criterion(out, yb)
                    val_loss_sum += loss.item()
                    val_batches += 1

            # 3. Aggregate metrics across all processes (sum of losses and batch counts)
            metrics_tensor = torch.tensor([train_loss_sum, train_batches, val_loss_sum, val_batches], device=device)
            dist.all_reduce(metrics_tensor, op=dist.ReduceOp.SUM)

            # max(..., 1) guards against division by zero when no batches were seen.
            global_train_loss = metrics_tensor[0] / max(metrics_tensor[1], 1)
            global_val_loss = metrics_tensor[2] / max(metrics_tensor[3], 1)

            if rank == 0:
                print(f"Epoch {epoch+1}/{params['epochs']} | Train Loss: {global_train_loss:.4f} | Val Loss: {global_val_loss:.4f}")

            # 4. Early stopping uses the all-reduced loss, so every rank decides the same way.
            early_stopper(global_val_loss.item())

            if early_stopper.early_stop:
                if rank == 0:
                    print(f"Early stopping triggered at epoch {epoch+1}")
                break

        # --- Save Model ---
        # Saves the unwrapped module so the keys carry no DDP "module." prefix.
        if rank == 0:
            torch.save(model.state_dict(), params["model_path"])

        # Wait for all processes, then tear the process group down.
        dist.barrier()
        dist.destroy_process_group()

    except Exception:
        # Report from every rank (a failure on a non-zero rank is otherwise invisible);
        # rank is None when the process group never came up.
        rank_label = "?" if rank is None else rank
        print(f"[rank {rank_label}] train_fn failed:\n{traceback.format_exc()}")

        # Try to cleanup even on error
        try:
            if dist.is_initialized():
                dist.destroy_process_group()
        except Exception:
            pass

        sys.exit(1)

        


# 2. SPARKPYTORCHESTIMATOR - Fixed for large datasets
class SparkPyTorchEstimator:
    """
    Trains a PyTorchMLPRegressor with TorchDistributor and scores Spark DataFrames.

    ``fit`` expects a DataFrame with a ``scaled_features`` vector column and a
    ``DEP_DELAY`` label column; ``transform`` expects ``scaled_features`` and appends
    a double ``prediction`` column.

    Args:
        hidden_layers: hidden layer widths (defaults to [128, 64]).
        dropout_rate: dropout probability for the MLP.
        learning_rate: Adam learning rate.
        batch_size: training batch size.
        epochs: maximum number of training epochs.
        num_processes: number of distributed processes; defaults to the number of
            visible CUDA devices, or 1 without CUDA.
        infer_batch_size: inference batch size (defaults to ``batch_size``;
            ``transform`` additionally caps it at 512).
        patience: early-stopping patience in epochs.
        weight_decay: Adam weight decay.
    """
    def __init__(self, hidden_layers=None, dropout_rate=0.3, learning_rate=0.001, 
                 batch_size=256, epochs=30, num_processes=None, infer_batch_size=None, patience=5, weight_decay=0.00001):
        
        self.hidden_layers = hidden_layers or [128, 64]
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs

        self.infer_batch_size = infer_batch_size or batch_size
        self.patience = patience
        self.weight_decay = weight_decay
        
        # GPU usage is decided from what the constructing process can see.
        self.use_gpu = torch.cuda.is_available()
        
        if num_processes is None:
            if self.use_gpu:
                self.num_processes = torch.cuda.device_count()
            else:
                self.num_processes = 1
        else:
            self.num_processes = max(1, num_processes)

        # Fixed location where rank 0 writes (and fit() reads back) the weights.
        self.model_path = "/dbfs/tmp/torch_mlp_state_dict.pth"
        self.input_dim = None
        self.num_shards = None
        
    def fit(self, df):
        """
        Write the training data as parquet shards, run distributed training, and load
        the resulting weights into ``self.trained_model``.

        Args:
            df: Spark DataFrame with ``scaled_features`` and ``DEP_DELAY`` columns.

        Returns:
            self

        Raises:
            ValueError: if ``df`` has no rows when the input dimension is detected.
        """
        df.sparkSession.catalog.clearCache()
        
        if self.input_dim is None:
            try:
                # take(1) fetches a single row; the vector length is the input dimension.
                sample = df.select(vector_to_array("scaled_features").alias("features_arr")).take(1)
                if not sample:
                    raise ValueError("No rows in training dataframe.")
                self.input_dim = len(sample[0]["features_arr"])
                print(f"Detected input dimension: {self.input_dim}")
            except Exception as e:
                print(f"Error determining input_dim: {e}")
                raise

        # Prepare train DF (features_arr + label)
        df_train = df.select(
            vector_to_array("scaled_features").alias("features_arr"),
            F.col("DEP_DELAY").cast(DoubleType())
        ).dropna(subset=["features_arr", "DEP_DELAY"])

        unique_id = str(uuid.uuid4())
        train_path = f"dbfs:/tmp/mlp_train_{unique_id}"
        
        # NOTE(review): nothing in this file sets `_cached_count`, so this presumably
        # always falls back to the hard-coded 60M estimate -- confirm.
        estimated_rows = df_train.count() if hasattr(df_train, '_cached_count') else 60_000_000
        
        # Aim for ~30k rows per shard, clamped to [2000, 8000] shards.
        optimal_shards = max(2000, int(estimated_rows / 30000))
        num_shards = min(optimal_shards, 8000)

        # Stored for use in transform
        self.num_shards = num_shards
        
        num_columns = len(df_train.columns)
        print(f"Number of columns: {num_columns}")
        print(f"Number of shards: {num_shards}")
        print(f"Estimated rows per shard: {estimated_rows / num_shards:.0f}")

        df_train.persist()
        try:
            (
                df_train
                .repartition(num_shards)
                .write
                .mode("overwrite")
                .parquet(train_path)
            )
        finally:
            # Release the cache even if the write fails.
            df_train.unpersist()
        
        params = {
            "input_dim": self.input_dim,
            "hidden_layers": self.hidden_layers,
            "dropout_rate": self.dropout_rate,
            "learning_rate": self.learning_rate,
            "weight_decay": self.weight_decay,
            "batch_size": self.batch_size,
            "epochs": self.epochs,
            "use_gpu": self.use_gpu,
            "model_path": self.model_path,
            "train_path": train_path,
            "patience": self.patience
        }

        distributor = TorchDistributor(
            num_processes=self.num_processes,
            local_mode=False,
            use_gpu=self.use_gpu,
        )

        # Ensure model path directory exists
        model_path_obj = Path(self.model_path)
        model_path_obj.parent.mkdir(parents=True, exist_ok=True)

        print(f"Starting distributed training. Processes: {self.num_processes}, CUDA: {self.use_gpu}")
        distributor.run(train_fn, params)

        # Load the weights on the driver. map_location="cpu" keeps this independent of
        # whichever device rank 0 saved from; the driver copy stays on CPU.
        self.trained_model = PyTorchMLPRegressor(
            self.input_dim, self.hidden_layers, self.dropout_rate
        )
        self.trained_model.load_state_dict(torch.load(self.model_path, map_location="cpu"))
        self.trained_model.eval()

        return self

    def transform(self, df):
        """
        Append a ``prediction`` column computed by the trained MLP.

        Args:
            df: Spark DataFrame with a ``scaled_features`` vector column.

        Returns:
            Spark DataFrame with df's columns plus a double ``prediction`` column.

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        if not hasattr(self, 'trained_model'):
            raise ValueError("Model not fitted.")

        from pyspark.sql.types import StructField, StructType

        # Copy everything the executor-side closure needs into locals so that `self`
        # (and the driver model it holds) is not captured and serialized.
        model_state_dict = self.trained_model.state_dict()
        input_dim = self.input_dim
        hidden_layers = self.hidden_layers
        dropout_rate = self.dropout_rate
        use_gpu = self.use_gpu

        # Build a new schema: StructType.add() would modify df's own schema object.
        schema = StructType(list(df.schema.fields) + [StructField("prediction", DoubleType(), True)])

        # Cap the inference batch size at 512 to bound memory per forward pass.
        infer_batch_size = min(self.infer_batch_size, 512)
        
        print(f"Using inference batch size: {infer_batch_size}")

        def predict_partition_full(iterator):
            # 1. Setup device
            device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
            
            # 2. Build and load the model once per call, not once per batch
            worker_model = PyTorchMLPRegressor(
                input_dim, hidden_layers, dropout_rate
            ).to(device)
            worker_model.load_state_dict(model_state_dict)
            worker_model.eval()
            
            with torch.no_grad():
                for pdf_batch in iterator:
                    if len(pdf_batch) == 0:
                        continue
                        
                    X_np = np.stack(pdf_batch["features_arr"].values)
                    n = X_np.shape[0]

                    # Pre-allocate predictions on CPU as float64 (matches DoubleType)
                    preds_all = np.empty(n, dtype=np.float64)

                    # --- BATCHED INFERENCE ---
                    for start in range(0, n, infer_batch_size):
                        end = min(start + infer_batch_size, n)
                        inputs = torch.from_numpy(X_np[start:end]).float().to(device)
                        preds = worker_model(inputs).cpu().numpy().astype(np.float64)
                        preds_all[start:end] = preds
                        
                        del inputs
                        # Only touch the CUDA allocator when this task really runs on a GPU.
                        if device.type == "cuda":
                            torch.cuda.empty_cache()

                    pdf_batch["prediction"] = preds_all
                    # features_arr is a temporary helper column, not part of `schema`.
                    yield pdf_batch.drop(columns=["features_arr"])

        # Add the array column temporarily
        df_with_arr = df.withColumn("features_arr", vector_to_array("scaled_features"))
        
        # Use fewer partitions for inference than training shards (at most 2000, at least 1).
        num_inference_partitions = max(1, min(self.num_shards // 4, 2000))
        df_with_arr = df_with_arr.repartition(num_inference_partitions)
        
        # Final transform
        return df_with_arr.mapInPandas(predict_partition_full, schema=schema)


# 3. MLP PIPELINE - No changes needed, but added for completeness
class MLPFlightDelayPipeline:
    """
    Wrapper that combines Spark preprocessing + PyTorch MLP into a single estimator.

    Preprocessing is: mean imputation of numerical features, string indexing and
    one-hot encoding of categorical features, vector assembly, and standard scaling
    into a ``scaled_features`` column consumed by SparkPyTorchEstimator.

    Args:
        categorical_features: names of the categorical input columns.
        numerical_features: names of the numerical input columns.
        mlp_params: keyword arguments forwarded to SparkPyTorchEstimator
            (defaults to an empty dict).
    """
    
    def __init__(
        self,
        categorical_features,
        numerical_features,
        mlp_params=None,
    ):
        self.categorical_features = categorical_features
        self.numerical_features = numerical_features
        self.mlp_params = mlp_params or {}
        
        # Fitted preprocessing model and fitted estimator; both set by fit().
        self.preprocessing_pipeline = None
        self.pytorch_estimator = None
        
    def _build_preprocessing_pipeline(self):
        """Build (unfitted) preprocessing stages, store them on self and return the Pipeline."""
        imputer = Imputer(
            inputCols=self.numerical_features,
            outputCols=[f"{name}_IMPUTED" for name in self.numerical_features],
            strategy="mean"
        )
        
        indexer = StringIndexer(
            inputCols=self.categorical_features,
            outputCols=[f"{name}_INDEX" for name in self.categorical_features],
            handleInvalid="keep"
        )
        
        encoder = OneHotEncoder(
            inputCols=[f"{name}_INDEX" for name in self.categorical_features],
            outputCols=[f"{name}_VEC" for name in self.categorical_features],
            dropLast=False
        )
        
        assembler = VectorAssembler(
            inputCols=[f"{name}_VEC" for name in self.categorical_features] + 
                      [f"{name}_IMPUTED" for name in self.numerical_features],
            outputCol="features",
            handleInvalid="skip"
        )
        
        scaler = StandardScaler(
            inputCol="features",
            outputCol="scaled_features",
            withMean=True,
            withStd=True
        )
        
        self.preprocessing_pipeline = Pipeline(
            stages=[imputer, indexer, encoder, assembler, scaler]
        )
        
        return self.preprocessing_pipeline

    def _cast_numerical_features(self, df):
        """Return df with every numerical feature column cast to double."""
        for col_name in self.numerical_features:
            df = df.withColumn(col_name, F.col(col_name).cast(DoubleType()))
        return df
    
    def fit(self, df):
        """
        Fit preprocessing and the MLP on ``df``.

        Preprocessing is rebuilt and refitted on every call, so calling fit() again
        (e.g. on another cross-validation fold) never reuses statistics learned from
        a previous training set.

        Args:
            df: Spark DataFrame containing the feature columns and ``DEP_DELAY``.

        Returns:
            self
        """
        df.sparkSession.catalog.clearCache()
        
        # The same double cast is applied at fit and transform time so the fitted
        # stages always see the column types they were fitted on.
        typed_df = self._cast_numerical_features(df)
        self.preprocessing_pipeline = self._build_preprocessing_pipeline().fit(typed_df)
        
        # Transform training data
        preprocessed = self.preprocessing_pipeline.transform(typed_df)
        
        # Cached because the estimator reads it more than once (sample + shard write).
        preprocessed.persist()
        try:
            self.pytorch_estimator = SparkPyTorchEstimator(**self.mlp_params)
            self.pytorch_estimator.fit(preprocessed)
        finally:
            # Release the cache whether or not training succeeded.
            preprocessed.unpersist()
        
        return self
    
    def transform(self, df):
        """
        Preprocess ``df`` with the fitted stages and append MLP predictions.

        Args:
            df: Spark DataFrame containing the feature columns.

        Returns:
            Spark DataFrame produced by SparkPyTorchEstimator.transform.

        Raises:
            ValueError: if fit() has not been called.
        """
        if self.preprocessing_pipeline is None or self.pytorch_estimator is None:
            raise ValueError("Pipeline not fitted yet. Call fit() first.")
        
        # Apply preprocessing (same numeric cast as in fit)
        preprocessed = self.preprocessing_pipeline.transform(self._cast_numerical_features(df))
        
        # Generate predictions
        predictions_df = self.pytorch_estimator.transform(preprocessed)
        
        return predictions_df

In [0]:
# NOTE(review): `data_loader` is created here but not passed to FlightDelayCV in the
# cell that runs cross-validation for this configuration -- confirm whether it is needed.
data_loader = cv.FlightDelayDataLoader()



# =====================================================
# 4. USAGE WITH FLIGHTDELAYCV
# =====================================================

# Feature definitions
# Categorical columns: string-indexed and one-hot encoded by MLPFlightDelayPipeline.
categorical_features = [
    'day_of_week', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'day_of_month', 'month'
]

# Numerical columns: mean-imputed and standard-scaled by MLPFlightDelayPipeline.
numerical_features = [
    'hourlyprecipitation', 'hourlysealevelpressure', 'hourlyaltimetersetting',
    'hourlywetbulbtemperature', 'hourlystationpressure', 'hourlywinddirection',
    'hourlyrelativehumidity', 'hourlywindspeed', 'hourlydewpointtemperature',
    'hourlydrybulbtemperature', 'hourlyvisibility', 'crs_elapsed_time', 'distance', 'elevation',


    #Flight Lineage Derived Features
    # Scheduled time features (data leakage-free)
    'scheduled_lineage_rotation_time_minutes',
    'scheduled_lineage_turnover_time_minutes',

    # Other known features (data leakage-free)
    'prev_flight_distance',

    # Safe features (intelligent data leakage handling)
    'safe_lineage_rotation_time_minutes', # Duration between the known (or suspected) previous actual departure time and the planned departure time

    # Other flight lineage features
    'lineage_rank', # Number of recorded flights for that airplane
]


# PyTorch hyperparameters (forwarded as keyword arguments to SparkPyTorchEstimator)
# 'weight_decay' is not set here, so the estimator's default (0.00001) applies.
mlp_params = {
    'hidden_layers': [512, 256, 128],
    'dropout_rate': 0.1,
    'learning_rate': 0.001,
    'batch_size': 1024,
    'epochs': 50,           # Upper bound on epochs; early stopping may end training sooner
    'patience': 10,          # Stop if val loss doesn't improve for 10 consecutive epochs
    'infer_batch_size': 256,

}

# Initialize MLP pipeline
mlp_pipeline = MLPFlightDelayPipeline(
    categorical_features=categorical_features,
    numerical_features=numerical_features,
    mlp_params=mlp_params,
)

 

In [0]:

# Cross-validate the baseline MLP pipeline (cv.FlightDelayCV comes from the cv.py
# module loaded at the top of the notebook).
# No dataloader is passed here, so FlightDelayCV presumably builds/uses its own
# default loader -- confirm against cv.py.
crossvalidator = cv.FlightDelayCV(
    estimator=mlp_pipeline,
    version="60M"
)

# Run cross-validation
cv_results = crossvalidator.fit()
print("Cross-Validation Results:")
# The results are shown twice: as plain text and via the notebook's rich display.
print(cv_results)
display(cv_results)


### Debugging


In [0]:
# # Check data distribution
# train_df.rdd.glom().map(len).collect()  # See partition sizes

# # Monitor during training
# print_memory_stats()  # Call periodically

# # Test single partition
# test_partition = train_df.limit(1000).repartition(1)
# model.fit(test_partition)  # Should work quickly

## Cross Validation for 60M

# Model 2 

Batch size = 512
Learning rate = 0.002
Patience = 5 

Model size: 512x256x128

# Using Graph Features 

In [0]:
# NOTE(review): this rebinds the notebook-global `data_loader`; the run cell for this
# configuration uses `data_loader_with_graph` instead -- confirm whether this is needed.
data_loader = cv.FlightDelayDataLoader()


# =====================================================
# 4. USAGE WITH FLIGHTDELAYCV
# =====================================================

# Feature definitions
# These assignments overwrite the notebook-global feature lists, mlp_params and
# mlp_pipeline defined by the earlier configuration cell.
categorical_features = [
    'day_of_week', 'op_carrier', 'dep_time_blk', 'arr_time_blk', 'day_of_month', 'month'
]

numerical_features = [
    'hourlyprecipitation', 'hourlysealevelpressure', 'hourlyaltimetersetting',
    'hourlywetbulbtemperature', 'hourlystationpressure', 'hourlywinddirection',
    'hourlyrelativehumidity', 'hourlywindspeed', 'hourlydewpointtemperature',
    'hourlydrybulbtemperature', 'hourlyvisibility', 'crs_elapsed_time', 'distance', 'elevation',


    #Flight Lineage Derived Features
    # Scheduled time features (data leakage-free)
    'scheduled_lineage_rotation_time_minutes',
    'scheduled_lineage_turnover_time_minutes',

    # Other known features (data leakage-free)
    'prev_flight_distance',

    # Safe features (intelligent data leakage handling)
    'safe_lineage_rotation_time_minutes', # Duration between the known (or suspected) previous actual departure time and the planned departure time

    # Other flight lineage features
    'lineage_rank', # Number of recorded flights for that airplane

    # ============================================================================
    # Graph Features
    # ============================================================================

    'prev_flight_origin_pagerank_weighted', # New!
    'prev_flight_origin_pagerank_unweighted', # New!
    'origin_pagerank_weighted',
    'origin_pagerank_unweighted',
    'dest_pagerank_weighted',
    'dest_pagerank_unweighted'
]


# PyTorch hyperparameters (forwarded as keyword arguments to SparkPyTorchEstimator)
mlp_params = {
    'hidden_layers': [512, 256, 128],
    'dropout_rate': 0.1,
    'learning_rate': 0.0003,
    'weight_decay': 0.00001,
    'batch_size': 1024,
    'epochs': 30,           # Upper bound on epochs; early stopping may end training sooner
    'patience': 7,          # Stop if val loss doesn't improve for 7 consecutive epochs
    'infer_batch_size': 256,

}

# Initialize MLP pipeline
mlp_pipeline = MLPFlightDelayPipeline(
    categorical_features=categorical_features,
    numerical_features=numerical_features,
    mlp_params=mlp_params,
)

 

In [0]:

# Data loader created with the "_with_graph" suffix; load() is called before it is
# handed to the cross-validator.
data_loader_with_graph = cv.FlightDelayDataLoader(suffix="_with_graph")
data_loader_with_graph.load() 

# Cross-validator for the graph-feature pipeline (cv.FlightDelayCV comes from the
# cv.py module loaded at the top of the notebook).
crossvalidator = cv.FlightDelayCV(
    estimator=mlp_pipeline,
    dataloader=data_loader_with_graph,
    version="60M"
)

# Only fit_fold(0) is called here, not fit(); presumably this trains a single fold
# rather than the full cross-validation -- confirm against cv.py.
cv_results = crossvalidator.fit_fold(0)
print("Cross-Validation Results:")
# The results are shown twice: as plain text and via the notebook's rich display.
print(cv_results)
display(cv_results)


In [0]:
def hard_cleanup_spark_pytorch():
    """
    Best-effort cleanup of Spark caches, CUDA memory and Python garbage.

    Each stage is independent: a failure is reported as a warning and the remaining
    stages still run. Safe to call multiple times.

    Returns:
        None. Progress and warnings are printed to stdout.
    """

    print("Starting hard cleanup...")

    # ---- Spark cleanup ----
    try:
        spark = SparkSession.builder.getOrCreate()
        spark.catalog.clearCache()

        # Unpersist every RDD still marked persistent. The JavaSparkContext accessor
        # returns a java.util.Map, which py4j exposes as an iterable mapping (the
        # Scala map behind `_jsc.sc()` cannot be iterated from Python).
        for persistent_rdd in spark.sparkContext._jsc.getPersistentRDDs().values():
            persistent_rdd.unpersist()
    except Exception as e:
        print(f"Spark cleanup warning: {e}")

    # ---- PyTorch cleanup ----
    try:
        import torch

        if torch.cuda.is_available():
            # Let queued GPU work finish before releasing cached blocks.
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
    except Exception as e:
        print(f"PyTorch cleanup warning: {e}")

    # ---- Python GC ----
    import gc
    gc.collect()

    print("\u2713 Hard cleanup complete\n")


hard_cleanup_spark_pytorch()

In [0]:


# Evaluate using model 2 (last model)
# `crossvalidator` is whichever instance the most recently executed cell bound, so
# this cell depends on notebook execution order.
cv_eval_60M = crossvalidator.evaluate()
# The evaluation is shown twice: as plain text and via the notebook's rich display.
print(cv_eval_60M)
display(cv_eval_60M)