MLP Model using PyTorch + GPU Support
=====================================

# Summary

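- This notebook builds a distributed MLP regression model: Spark ML handles preprocessing, PyTorch training is launched through TorchDistributor, and inference runs in parallel via `mapInPandas`.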


In [0]:

# Spark Settings
# Cap the number of records per Arrow batch to avoid OOM errors.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "32768") 

import importlib.util
import sys

# Load the cv module directly from its file path
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
spec = importlib.util.spec_from_file_location("cv", cv_path)
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot build an import spec for the cv module at {cv_path!r}")
cv = importlib.util.module_from_spec(spec)

# Register the module before executing it (the importlib recipe for importing
# a source file directly). Without this entry, anything that resolves the
# module by name -- e.g. pickling objects whose classes live in cv -- cannot
# find it.
sys.modules[spec.name] = cv
try:
    spec.loader.exec_module(cv)
except Exception:
    # Do not leave a half-initialised module registered if execution fails.
    sys.modules.pop(spec.name, None)
    raise

In [0]:
# %run "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/Cross Validator Module"

In [0]:
# Instantiate the flight-delay data loader exposed by the cv module and run
# its load() step. NOTE(review): what load() reads or caches is defined in
# cv.py, not in this notebook -- confirm there.
data_loader = cv.FlightDelayDataLoader()
data_loader.load()

In [0]:
# =====================================================
# PYTORCH MLP REGRESSOR INTEGRATED WITH TORCHDISTRIBUTOR
# =====================================================

import uuid
from pathlib import Path
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

from pyspark.sql import SparkSession, functions as F
from pathlib import Path
from pyspark.ml.feature import (
    Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array
from pyspark.sql.types import DoubleType

# >>> PYTORCH AND DISTRIBUTOR IMPORTS <<<
import torch
import torch.nn as nn
import torch.optim as optim
from pyspark.ml.torch.distributor import TorchDistributor 
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# --- PYTORCH TRAIN FUNCTION (RUNS ON WORKERS) ---

def train_fn(X, y, params):
    """
    Train the MLP with DistributedDataParallel inside one worker process.

    Run on each worker launched by TorchDistributor.
    No Spark code in here.

    Args:
        X: Feature matrix, one row per sample. Converted with ``np.asarray``
            and cast to float32.
        y: Regression targets aligned with ``X``. Cast to float32.
        params: Dict with the keys ``input_dim``, ``hidden_layers``,
            ``dropout_rate``, ``learning_rate``, ``batch_size``, ``epochs``,
            ``use_gpu`` and ``model_path``.

    Side effects:
        Rank 0 prints its mean batch loss after every epoch and, once
        training finishes, writes the model ``state_dict`` (as CPU tensors)
        to ``params["model_path"]``.
    """
    import os
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP
    from torch.utils.data import TensorDataset, DataLoader, DistributedSampler
    import numpy as np

    # --- local model definition ---
    class PyTorchMLPRegressor_Worker(nn.Module):
        def __init__(self, input_dim, hidden_layers, dropout_rate=0.3):
            super().__init__()
            layers = []
            in_features = input_dim
            for units in hidden_layers:
                layers.append(nn.Linear(in_features, units))
                layers.append(nn.BatchNorm1d(units))
                layers.append(nn.ReLU())
                layers.append(nn.Dropout(dropout_rate))
                in_features = units
            layers.append(nn.Linear(in_features, 1))
            self.network = nn.Sequential(*layers)

        def forward(self, x):
            return self.network(x).squeeze(1)

    # --- device selection ---
    if params["use_gpu"]:
        local_rank = int(os.environ["LOCAL_RANK"])
        device = torch.device(f"cuda:{local_rank}")
        device_ids = [local_rank]
        # Pin this process to its own GPU before any NCCL work; otherwise
        # collectives such as barrier() run on the default device (cuda:0)
        # for every rank.
        torch.cuda.set_device(device)
        backend = "nccl"
    else:
        device = torch.device("cpu")
        device_ids = None
        backend = "gloo"

    # --- DDP / process group init ---
    dist.init_process_group(backend=backend)

    try:
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        # --- dataset & sampler ---
        X = torch.from_numpy(np.asarray(X)).float()
        y = torch.from_numpy(np.asarray(y)).float()
        dataset = TensorDataset(X, y)
        sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True)

        # BatchNorm1d cannot run in training mode on a single-sample batch.
        # DistributedSampler gives every rank the same number of samples, so
        # every rank takes the same decision here and DDP stays in lockstep.
        batch_size = params["batch_size"]
        samples_per_rank = len(sampler)
        drop_single_tail = (
            bool(params["hidden_layers"])
            and samples_per_rank > batch_size
            and samples_per_rank % batch_size == 1
        )

        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=sampler,
            drop_last=drop_single_tail,
        )

        # --- model / optimizer / loss ---
        model = PyTorchMLPRegressor_Worker(
            params["input_dim"],
            params["hidden_layers"],
            params["dropout_rate"],
        ).to(device)

        ddp_model = DDP(model, device_ids=device_ids)
        optimizer = optim.Adam(ddp_model.parameters(), lr=params["learning_rate"])
        criterion = nn.MSELoss()

        # --- training loop ---
        for epoch in range(params["epochs"]):
            ddp_model.train()
            sampler.set_epoch(epoch)  # important for DistributedSampler
            total_loss = 0.0
            num_batches = 0

            for xb, yb in loader:
                xb = xb.to(device)
                yb = yb.to(device)

                optimizer.zero_grad()
                out = ddp_model(xb)
                loss = criterion(out, yb)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                num_batches += 1

            # log from rank 0 only (this is rank 0's local loss, not a global mean)
            if rank == 0:
                avg_loss = total_loss / max(num_batches, 1)
                print(f"Epoch {epoch+1}/{params['epochs']} - Loss: {avg_loss:.4f}")

        # --- save model once on rank 0 ---
        dist.barrier()
        if rank == 0:
            # Store CPU tensors so the checkpoint can be loaded on a machine
            # without a GPU. state_dict() returns a fresh mapping, so
            # rebinding its entries does not touch the live model.
            state = model.state_dict()
            for key in list(state):
                state[key] = state[key].cpu()
            torch.save(state, params["model_path"])
    finally:
        # Always tear the process group down, even when training fails.
        dist.destroy_process_group()


# --- DRIVER-SIDE MLP MODEL DEFINITION ---

class PyTorchMLPRegressor(nn.Module):
    """
    Feed-forward regression network (driver-side definition).

    Each entry of ``hidden_layers`` contributes one
    Linear -> BatchNorm1d -> ReLU -> Dropout stage; a final single-unit
    Linear layer with no activation produces the prediction.
    """
    def __init__(self, input_dim, hidden_layers, dropout_rate=0.3):
        super().__init__()

        stages = []
        fan_in = input_dim
        for width in hidden_layers:
            stages += [
                nn.Linear(fan_in, width),   # dense projection
                nn.BatchNorm1d(width),      # normalisation
                nn.ReLU(),                  # non-linearity
                nn.Dropout(dropout_rate),   # regularisation
            ]
            fan_in = width

        # Regression head: one output unit, no activation
        stages.append(nn.Linear(fan_in, 1))

        self.network = nn.Sequential(*stages)

    def forward(self, x):
        # Drop the trailing singleton dimension so the output matches the target shape
        predictions = self.network(x)
        return predictions.squeeze(1)


# --- PYTORCH SPARK ESTIMATOR ---

class SparkPyTorchEstimator:
    """
    Estimator-style wrapper around the PyTorch MLP.

    ``fit`` trains with TorchDistributor and ``transform`` scores a DataFrame
    with ``mapInPandas``. Input DataFrames must carry a ``scaled_features``
    vector column; ``fit`` additionally reads the ``DEP_DELAY`` label column.
    """

    def __init__(self, hidden_layers=None, dropout_rate=0.3, learning_rate=0.001, 
                 batch_size=256, epochs=30, num_processes=None, infer_batch_size=None):
        """
        Args:
            hidden_layers: Hidden layer widths; defaults to ``[128, 64]``.
            dropout_rate: Dropout probability applied after every hidden layer.
            learning_rate: Adam learning rate.
            batch_size: Training batch size per process.
            epochs: Number of training epochs.
            num_processes: Number of training processes. When ``None``, one
                per GPU visible to the driver, or 1 on CPU. Values below 1
                are raised to 1.
            infer_batch_size: Batch size used during inference; defaults to
                ``batch_size``.
        """
        self.hidden_layers = hidden_layers or [128, 64]
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs

        # Separate inference batch size (defaults to the training batch size)
        self.infer_batch_size = infer_batch_size or batch_size

        # GPU availability is probed on the driver
        self.use_gpu = torch.cuda.is_available()

        if num_processes is None:
            num_processes = torch.cuda.device_count() if self.use_gpu else 1
        # Ensure a minimum of 1 process
        self.num_processes = max(1, num_processes)

        self.model_path = "/dbfs/tmp/torch_mlp_state_dict.pth"
        self.input_dim = None
        self.trained_model = None

    @staticmethod
    def _predict_in_batches(model, feature_rows, device, batch_size):
        """
        Score ``feature_rows`` with ``model`` in chunks of ``batch_size``.

        Args:
            model: Callable mapping a float tensor of shape (k, d) to a
                tensor of k predictions.
            feature_rows: Sequence of equal-length 1-D feature arrays.
            device: Torch device the inputs are moved to.
            batch_size: Maximum number of rows per forward pass.

        Returns:
            A float64 NumPy array with one prediction per input row (empty
            when ``feature_rows`` is empty).
        """
        n = len(feature_rows)
        # Pre-allocate on CPU as float64 to match Spark's DoubleType
        preds_all = np.empty(n, dtype=np.float64)
        if n == 0:
            # np.stack raises on an empty sequence
            return preds_all

        X_np = np.stack(feature_rows)
        with torch.no_grad():
            for start in range(0, n, batch_size):
                end = min(start + batch_size, n)
                inputs = torch.from_numpy(X_np[start:end]).float().to(device)
                preds_all[start:end] = model(inputs).cpu().numpy().astype(np.float64)
        return preds_all

    def fit(self, df):
        """
        Uses TorchDistributor for distributed training.

        The training rows are collected to the driver as NumPy arrays, handed
        to ``train_fn`` through ``TorchDistributor.run``, and the weights
        saved at ``self.model_path`` are then loaded into
        ``self.trained_model``.

        Args:
            df: DataFrame with ``scaled_features`` and ``DEP_DELAY`` columns.

        Returns:
            ``self``.

        Raises:
            ValueError: If ``df`` has no rows, or none left after dropping nulls.
        """
        if self.input_dim is None:
            first_row = df.select(vector_to_array("scaled_features")).head()
            if first_row is None:
                raise ValueError("Cannot fit on an empty DataFrame.")
            self.input_dim = len(first_row[0])

        df_train = df.select(
            vector_to_array("scaled_features").alias("features_arr"),
            F.col("DEP_DELAY")
        ).dropna()

        # Repartition for better shuffle balance (not used directly by TorchDistributor)
        num_partitions = self.num_processes * 4 * 6
        df_train = df_train.repartition(num_partitions)

        # Collect as pandas and build NumPy arrays
        pdf = df_train.toPandas()
        if pdf.empty:
            raise ValueError("No training rows left after dropping nulls.")
        # train_fn casts to float32 anyway; casting here halves the payload
        # shipped to the training processes.
        X = np.stack(pdf["features_arr"].values).astype(np.float32)
        y = pdf["DEP_DELAY"].values.astype(np.float32)

        params = {
            "input_dim": self.input_dim,
            "hidden_layers": self.hidden_layers,
            "dropout_rate": self.dropout_rate,
            "learning_rate": self.learning_rate,
            "batch_size": self.batch_size,
            "epochs": self.epochs,
            "use_gpu": self.use_gpu,
            "model_path": self.model_path,
        }

        distributor = TorchDistributor(
            num_processes=self.num_processes,
            local_mode=False,
            use_gpu=self.use_gpu,
        )

        # Ensure the parent directory for the model file exists
        Path(self.model_path).parent.mkdir(parents=True, exist_ok=True)
        print(f"Starting distributed training. Processes: {self.num_processes}, CUDA: {self.use_gpu}")

        # NOTE: args are exactly what train_fn expects: (X, y, params)
        distributor.run(train_fn, X, y, params)

        # Load the model on the driver. map_location keeps the weights on CPU
        # even if the checkpoint was written from a GPU.
        trained_model = PyTorchMLPRegressor(
            self.input_dim, self.hidden_layers, self.dropout_rate
        )
        trained_model.load_state_dict(torch.load(self.model_path, map_location="cpu"))
        trained_model.eval()
        self.trained_model = trained_model

        return self

    def transform(self, df):
        """
        Uses mapInPandas for parallel inference on workers, with explicit batching
        to avoid OOM during evaluation.

        Args:
            df: DataFrame with a ``scaled_features`` vector column.

        Returns:
            ``df`` with an extra ``prediction`` (double) column.

        Raises:
            ValueError: If ``fit`` has not produced a model yet.
        """
        if getattr(self, "trained_model", None) is None:
            raise ValueError("Model not fitted.")

        from pyspark.sql.types import StructField, StructType

        # Build a NEW schema: StructType.add() mutates the object it is called
        # on, and df.schema may be the DataFrame's cached schema instance.
        schema = StructType(
            list(df.schema.fields) + [StructField("prediction", DoubleType(), True)]
        )

        # Copy everything the worker function needs into locals so the
        # closure does not capture `self` (which would pickle the whole
        # estimator, including the model, a second time).
        model_state_dict = self.trained_model.state_dict()
        input_dim = self.input_dim
        hidden_layers = self.hidden_layers
        dropout_rate = self.dropout_rate
        use_gpu = self.use_gpu
        infer_batch_size = self.infer_batch_size
        predict_in_batches = self._predict_in_batches

        def predict_partition_full(iterator):
            # 1. Setup device (GPU for inference if available on the worker)
            device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")

            # 2. Initialize and load model ONCE per worker task
            worker_model = PyTorchMLPRegressor(
                input_dim, hidden_layers, dropout_rate
            ).to(device)
            worker_model.load_state_dict(model_state_dict)
            worker_model.eval()

            # no_grad is applied inside the helper rather than around the
            # yield, so the grad mode is never held across generator
            # suspension points.
            for pdf_batch in iterator:
                pdf_batch["prediction"] = predict_in_batches(
                    worker_model,
                    pdf_batch["features_arr"].values,
                    device,
                    infer_batch_size,
                )
                yield pdf_batch.drop(columns=["features_arr"])

        # Add the array column temporarily
        df_with_arr = df.withColumn("features_arr", vector_to_array("scaled_features"))

        # Final transform
        return df_with_arr.mapInPandas(predict_partition_full, schema=schema)
    

# =====================================================
# 2. MLP PIPELINE WRAPPER
# =====================================================

class MLPFlightDelayPipeline:
    """
    Wrapper that combines Spark preprocessing + PyTorch MLP into a single estimator.

    Preprocessing stages: Imputer (mean) -> StringIndexer -> OneHotEncoder ->
    VectorAssembler -> StandardScaler, producing the ``scaled_features``
    column consumed by ``SparkPyTorchEstimator``.
    """
    
    def __init__(
        self,
        categorical_features,
        numerical_features,
        mlp_params=None,
    ):
        """
        Args:
            categorical_features: Names of columns to index and one-hot encode.
            numerical_features: Names of columns to impute.
            mlp_params: Keyword arguments forwarded to
                ``SparkPyTorchEstimator``; defaults to an empty dict.
        """
        self.categorical_features = categorical_features
        self.numerical_features = numerical_features
        self.mlp_params = mlp_params or {}
        
        # Holds the fitted preprocessing model once fit() has run
        self.preprocessing_pipeline = None
        self.pytorch_estimator = None
        
    def _build_preprocessing_pipeline(self):
        """Create the unfitted preprocessing Pipeline, store it on self and return it."""
        imputed_cols = [f"{name}_IMPUTED" for name in self.numerical_features]
        index_cols = [f"{name}_INDEX" for name in self.categorical_features]
        vector_cols = [f"{name}_VEC" for name in self.categorical_features]

        imputer = Imputer(
            inputCols=self.numerical_features,
            outputCols=imputed_cols,
            strategy="mean"
        )
        
        # handleInvalid="keep" gives labels unseen at fit time their own index
        indexer = StringIndexer(
            inputCols=self.categorical_features,
            outputCols=index_cols,
            handleInvalid="keep"
        )
        
        encoder = OneHotEncoder(
            inputCols=index_cols,
            outputCols=vector_cols,
            dropLast=False
        )
        
        assembler = VectorAssembler(
            inputCols=vector_cols + imputed_cols,
            outputCol="features",
            handleInvalid="skip"
        )
        
        scaler = StandardScaler(
            inputCol="features",
            outputCol="scaled_features",
            withMean=True,
            withStd=True
        )
        
        self.preprocessing_pipeline = Pipeline(
            stages=[imputer, indexer, encoder, assembler, scaler]
        )
        
        return self.preprocessing_pipeline

    def _cast_numerical(self, df):
        """Return ``df`` with every numerical feature column cast to DoubleType."""
        for col_name in self.numerical_features:
            df = df.withColumn(col_name, F.col(col_name).cast(DoubleType()))
        return df
    
    def fit(self, df):
        """
        Fit the preprocessing stages and the MLP on ``df``.

        Preprocessing is rebuilt and refitted on every call, so reusing one
        instance across several training sets (e.g. cross-validation folds)
        never carries statistics over from an earlier fit.

        Args:
            df: Training DataFrame with the feature columns and the label
                column read by ``SparkPyTorchEstimator.fit``.

        Returns:
            ``self``.
        """
        # Mark the pipeline as unfitted until this fit completes
        self.pytorch_estimator = None

        # The same cast is applied in transform() so the fitted stages always
        # see the column types they were fitted on.
        train_df = self._cast_numerical(df)
        self.preprocessing_pipeline = self._build_preprocessing_pipeline().fit(train_df)
        
        # Transform training data
        preprocessed = self.preprocessing_pipeline.transform(train_df)
        
        # Build and fit PyTorch Estimator
        estimator = SparkPyTorchEstimator(**self.mlp_params)
        estimator.fit(preprocessed)
        self.pytorch_estimator = estimator
        
        return self
    
    def transform(self, df):
        """
        Preprocess ``df`` and append the MLP predictions.

        Numerical feature columns are cast to double first (matching fit), so
        they appear as double in the returned DataFrame.

        Raises:
            ValueError: If ``fit`` has not completed.
        """
        if self.preprocessing_pipeline is None or self.pytorch_estimator is None:
            raise ValueError("Pipeline not fitted yet. Call fit() first.")
        
        # Apply preprocessing
        preprocessed = self.preprocessing_pipeline.transform(self._cast_numerical(df))
        
        # Generate predictions
        return self.pytorch_estimator.transform(preprocessed)
    

# =====================================================
# 4. USAGE WITH FLIGHTDELAYCV
# =====================================================

# Columns handled by the categorical branch of the preprocessing pipeline
categorical_features = [
    'day_of_week',
    'op_carrier',
    'dep_time_blk',
    'arr_time_blk',
    'day_of_month',
    'month',
]

# Columns handled by the numerical (imputed) branch
numerical_features = [
    'hourlyprecipitation',
    'hourlysealevelpressure',
    'hourlyaltimetersetting',
    'hourlywetbulbtemperature',
    'hourlystationpressure',
    'hourlywinddirection',
    'hourlyrelativehumidity',
    'hourlywindspeed',
    'hourlydewpointtemperature',
    'hourlydrybulbtemperature',
    'hourlyvisibility',
    'crs_elapsed_time',
    'distance',
    'elevation',
]


# Hyperparameters forwarded to SparkPyTorchEstimator.
# infer_batch_size is deliberately smaller than the training batch size.
# Add num_processes=4 (for example) to override the automatic GPU/CPU detection.
mlp_params = dict(
    hidden_layers=[512, 256, 128],
    dropout_rate=0.1,
    learning_rate=0.005,
    batch_size=512,
    epochs=30,
    infer_batch_size=256,
)

# Assemble the preprocessing + MLP pipeline
mlp_pipeline = MLPFlightDelayPipeline(
    categorical_features=categorical_features,
    numerical_features=numerical_features,
    mlp_params=mlp_params,
)

# Hand the pipeline to the project's cross-validator (from the cv module)
crossvalidator = cv.FlightDelayCV(estimator=mlp_pipeline, version="12M")

# Run cross-validation and report the outcome
cv_results = crossvalidator.fit()
print("Cross-Validation Results:")
print(cv_results)

 