In [0]:
# Load local modules: Cross Validator
# cv.py is loaded straight from its workspace file path (the directory names
# contain spaces, so it cannot be pulled in with a plain `import` statement).
import importlib.util
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
# Build an import spec for the file, create an empty module object from it,
# then execute the file's code inside that module so `cv.<name>` is usable below.
spec = importlib.util.spec_from_file_location("cv", cv_path)
cv = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cv)

import mlflow
# Switch MLflow autologging off for the rest of this session.
mlflow.autolog(disable=True)

In [0]:
# Create the data loader exposed by the dynamically loaded `cv` module and
# invoke its load() method; the same instance is handed to FlightDelayCV below.
# NOTE(review): what load() reads or caches is defined in cv.py, not shown here.
data_loader = cv.FlightDelayDataLoader()
data_loader.load()

In [0]:
from pyspark.sql.functions import col, isnan, regexp_replace, when, length, trim
from pyspark.sql.types import DoubleType, StringType
from pyspark.ml import Pipeline
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression

class BaselineEstimator:
    """
    Baseline Linear Regression Estimator with Feature Engineering Pipeline.

    The class bundles data preparation and a Spark ML ``Pipeline``:
        1. Data preparation (label cast/filter, column selection)
        2. Numerical feature cleaning (string artifacts removed, NaN -> null)
        3. Median imputation for numerical features
        4. Categorical encoding (StringIndexer + OneHotEncoder)
        5. Feature assembly and standardization
        6. Linear regression modeling

    Feature Families (column names are lower-cased in ``__init__``):
        - Temporal: DAY_OF_WEEK, MONTH, DEP_TIME_BLK
        - Airport: ORIGIN, DEST
        - Flight: OP_UNIQUE_CARRIER, DISTANCE
        - Weather: HourlyWindSpeed, HourlyVisibility, HourlyPrecipitation

    Usage:
        ``fit(train_df)`` builds and fits the pipeline and returns ``self``;
        ``transform(df)`` applies the *fitted* pipeline and returns the
        DataFrame produced by the fitted ``PipelineModel``.

    Attributes:
        label_col: Name of the regression target column.
        pipeline: Unfitted ``Pipeline`` built by the last ``fit`` call, else None.
        model: Fitted ``PipelineModel`` set by ``fit``, else None.
        categorical_features: Lower-cased categorical column names.
        numerical_features: Lower-cased numerical column names.
    """

    # Binary label columns kept by ``_prepare`` (when present) for evaluation.
    _EVAL_LABEL_COLS = ("DEP_DEL15", "SEVERE_DEL60")
    # Category substituted for null categorical values before indexing.
    _NULL_CATEGORY = "UNKNOWN"
    # Type name ``DataFrame.dtypes`` reports for string columns.
    _STRING_DTYPE = "string"

    def __init__(self, label_col="DEP_DELAY"):
        """
        Initialize the estimator with feature definitions.

        Args:
            label_col (str): Name of the target variable column.
        """
        self.label_col = label_col
        self.pipeline = None
        self.model = None

        # Categorical features: encoded using StringIndexer + OneHotEncoder.
        categorical = [
            "DAY_OF_WEEK",        # Day of week
            "MONTH",              # Month of year
            "DEP_TIME_BLK",       # Departure time block (e.g., "0600-0659")
            "ORIGIN",             # Origin airport code
            "DEST",               # Destination airport code
            "OP_UNIQUE_CARRIER",  # Operating carrier code
        ]
        self.categorical_features = [name.lower() for name in categorical]

        # Numerical features: median-imputed and standardized.
        numerical = [
            "HourlyWindSpeed",      # Wind speed
            "HourlyVisibility",     # Visibility
            "HourlyPrecipitation",  # Precipitation
            "DISTANCE",             # Flight distance
        ]
        self.numerical_features = [name.lower() for name in numerical]

        # Feature columns that were actually present when the pipeline was
        # built; ``transform`` reuses them so scoring matches the fitted stages.
        self._active_numerical = None
        self._active_categorical = None

    def _prepare(self, df):
        """
        Clean the label and numerical features and keep only needed columns.

        Steps:
            1. Cast the label to DoubleType and drop rows whose label is
               null/NaN.
            2. Select the configured features, the label and the binary
               evaluation labels -- only those present in ``df`` (the
               membership test is an exact, case-sensitive name match).
            3. Convert each numerical feature to DoubleType:
               - string columns: strip everything except digits, ``+``, ``-``
                 and ``.`` (e.g. "12.5mph" -> "12.5"), map empty results to
                 null, then cast;
               - all other columns: cast directly and map NaN to null.

        Args:
            df (pyspark.sql.DataFrame): Input DataFrame containing the label.

        Returns:
            pyspark.sql.DataFrame: Cleaned DataFrame.
        """
        df = df.withColumn(self.label_col, col(self.label_col).cast(DoubleType()))
        df = df.filter(~(col(self.label_col).isNull() | isnan(col(self.label_col))))

        # dict.fromkeys de-duplicates while preserving order, so a label_col
        # equal to one of the evaluation labels is not selected twice.
        wanted = dict.fromkeys(
            self.categorical_features
            + self.numerical_features
            + [self.label_col, *self._EVAL_LABEL_COLS]
        )
        present = set(df.columns)
        df = df.select(*[name for name in wanted if name in present])

        dtypes = dict(df.dtypes)
        for f in self.numerical_features:
            if f not in dtypes:
                continue
            if dtypes[f] == self._STRING_DTYPE:
                # Regex cleaning is only meaningful for raw text values.
                cleaned = regexp_replace(col(f), r"[^0-9+\-\.]", "")
                cleaned = when(length(trim(cleaned)) == 0, None).otherwise(cleaned)
                df = df.withColumn(f, cleaned.cast(DoubleType()))
            else:
                # Do NOT round-trip numeric columns through strings: doubles
                # can stringify in exponent form (e.g. "1.0E-4"), which the
                # regex above would turn into an unparseable "1.0-4".
                as_double = col(f).cast(DoubleType())
                df = df.withColumn(f, when(isnan(as_double), None).otherwise(as_double))

        return df

    def _add_clean_columns(self, df, categorical_cols):
        """
        Add a ``<name>_clean`` string column for each categorical feature.

        Nulls become the "UNKNOWN" category; every other value is cast to
        string (a no-op for columns that are already strings). These columns
        are the inputs of the StringIndexer stages, so they must be added
        before both fitting and scoring.

        Args:
            df (pyspark.sql.DataFrame): Prepared DataFrame.
            categorical_cols (list[str]): Categorical columns present in ``df``.

        Returns:
            pyspark.sql.DataFrame: ``df`` with the ``*_clean`` columns added.
        """
        for f in categorical_cols:
            as_text = col(f).cast(StringType())
            df = df.withColumn(
                f"{f}_clean",
                when(col(f).isNull(), self._NULL_CATEGORY).otherwise(as_text),
            )
        return df

    def _build_pipeline(self, df):
        """
        Build the (unfitted) Spark ML Pipeline and store it in ``self.pipeline``.

        Pipeline Stages, in order:
            1. One median ``Imputer`` per numerical feature
            2. ``StringIndexer`` + ``OneHotEncoder`` per categorical feature
            3. ``VectorAssembler`` -> "features"
            4. ``StandardScaler`` -> "scaled_features"
            5. ``LinearRegression`` on "scaled_features"

        Only features present in ``df`` are used; the chosen columns are
        remembered so ``transform`` can apply the same set.

        Args:
            df (pyspark.sql.DataFrame): Prepared DataFrame (see ``_prepare``).

        Returns:
            pyspark.sql.DataFrame: ``df`` with the ``*_clean`` categorical
            columns added, ready to be passed to ``self.pipeline.fit``.

        Raises:
            ValueError: If none of the configured features exist in ``df``.
        """
        present = set(df.columns)
        num_cols = [f for f in self.numerical_features if f in present]
        cat_cols = [f for f in self.categorical_features if f in present]
        if not (num_cols or cat_cols):
            raise ValueError(
                "None of the configured feature columns are present in the DataFrame"
            )

        df = self._add_clean_columns(df, cat_cols)

        # Stage 1: median imputation (median chosen over mean for robustness
        # to outliers).
        stages = [
            Imputer(inputCols=[f], outputCols=[f"{f}_imputed"], strategy="median")
            for f in num_cols
        ]

        # Stage 2-3: categorical encoding.
        # handleInvalid="keep" lets labels unseen at fit time pass through
        # instead of raising.
        # NOTE(review): with "keep" the indexer reserves an extra bucket for
        # unseen labels, so dropLast=True presumably drops that bucket rather
        # than a real category -- confirm against the Spark ML docs.
        for f in cat_cols:
            stages.append(StringIndexer(inputCol=f"{f}_clean",
                                        outputCol=f"{f}_indexed",
                                        handleInvalid="keep"))
            stages.append(OneHotEncoder(inputCols=[f"{f}_indexed"],
                                        outputCols=[f"{f}_encoded"],
                                        dropLast=True))

        # Stage 4: assemble imputed numerical + encoded categorical features.
        # handleInvalid="skip" drops rows that still contain invalid values.
        feature_columns = ([f"{f}_imputed" for f in num_cols]
                           + [f"{f}_encoded" for f in cat_cols])
        assembler = VectorAssembler(inputCols=feature_columns,
                                    outputCol="features",
                                    handleInvalid="skip")

        # Stage 5: standardize to zero mean / unit variance.
        # NOTE(review): the Spark docs warn that withMean=True produces dense
        # vectors; with one-hot encoded airports this may be memory-heavy.
        scaler = StandardScaler(inputCol="features",
                                outputCol="scaled_features",
                                withStd=True,
                                withMean=True)

        # Stage 6: unregularized linear regression baseline.
        lr = LinearRegression(featuresCol="scaled_features",
                              labelCol=self.label_col,
                              maxIter=100,
                              regParam=0.0,
                              elasticNetParam=0.0)

        stages.extend([assembler, scaler, lr])
        self.pipeline = Pipeline(stages=stages)
        self._active_numerical = num_cols
        self._active_categorical = cat_cols

        return df

    def fit(self, df):
        """
        Fit the pipeline on training data.

        Args:
            df (pyspark.sql.DataFrame): Training DataFrame.

        Returns:
            BaselineEstimator: self (for method chaining).
        """
        df_prep = self._build_pipeline(self._prepare(df))
        self.model = self.pipeline.fit(df_prep)
        return self

    def transform(self, df):
        """
        Transform data using the fitted pipeline.

        The input goes through the same ``_prepare`` cleaning and gets the
        same ``*_clean`` categorical columns as at fit time. The pipeline is
        not rebuilt, so ``self.pipeline`` stays in sync with ``self.model``.

        Args:
            df (pyspark.sql.DataFrame): DataFrame to transform (val/test).

        Returns:
            pyspark.sql.DataFrame: Output of the fitted ``PipelineModel``.

        Raises:
            RuntimeError: If called before ``fit``.
            ValueError: If a feature column used at fit time is missing.
        """
        if self.model is None:
            raise RuntimeError("BaselineEstimator.transform() called before fit()")

        df_prep = self._prepare(df)
        present = set(df_prep.columns)

        # Fall back to "whatever is present" only when the model was attached
        # without going through fit() on this instance.
        num_cols = self._active_numerical
        if num_cols is None:
            num_cols = [f for f in self.numerical_features if f in present]
        cat_cols = self._active_categorical
        if cat_cols is None:
            cat_cols = [f for f in self.categorical_features if f in present]

        missing = [f for f in num_cols + cat_cols if f not in present]
        if missing:
            raise ValueError(
                f"Feature columns used at fit time are missing: {missing}"
            )

        df_prep = self._add_clean_columns(df_prep, cat_cols)
        return self.model.transform(df_prep)

In [0]:
# Wire a fresh (unfitted) BaselineEstimator and the shared data loader into
# the cross-validation driver from cv.py, selecting the "60M" version.
# NOTE(review): the meaning of `version` and what evaluate() computes are
# defined in cv.py, which is not visible here.
cv_set = cv.FlightDelayCV(
    estimator=BaselineEstimator(),
    dataloader=data_loader,
    version="60M"
)

cv_set.evaluate()

In [0]:
# Rebind cv_set to a new driver with a new BaselineEstimator instance, then
# call fit() instead of evaluate().
# NOTE(review): how fit() differs from evaluate() is defined in cv.py -- confirm there.
cv_set = cv.FlightDelayCV(
    estimator=BaselineEstimator(),
    dataloader=data_loader,
    version="60M"
)

cv_set.fit()

In [0]:
from pyspark.sql.functions import col, isnan, regexp_replace, when, length, trim
from pyspark.sql.types import DoubleType, StringType
from pyspark.ml import Pipeline
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression

class BaselineEstimator:
    """
    Baseline Linear Regression Estimator with Feature Engineering Pipeline.

    The class bundles data preparation and a Spark ML ``Pipeline``:
        1. Data preparation (label cast/filter, column selection)
        2. Numerical feature cleaning (string artifacts removed, NaN -> null)
        3. Median imputation for numerical features
        4. Categorical encoding (StringIndexer + OneHotEncoder)
        5. Feature assembly and standardization
        6. Linear regression modeling

    Feature Families (column names are lower-cased in ``__init__``):
        - Temporal: DAY_OF_WEEK, MONTH, DEP_TIME_BLK
        - Airport: ORIGIN, DEST
        - Flight: OP_UNIQUE_CARRIER, DISTANCE
        - Weather: HourlyWindSpeed, HourlyVisibility, HourlyPrecipitation

    Usage:
        ``fit(train_df)`` builds and fits the pipeline and returns ``self``;
        ``transform(df)`` applies the *fitted* pipeline and returns the
        DataFrame produced by the fitted ``PipelineModel``.

    Attributes:
        label_col: Name of the regression target column.
        pipeline: Unfitted ``Pipeline`` built by the last ``fit`` call, else None.
        model: Fitted ``PipelineModel`` set by ``fit``, else None.
        categorical_features: Lower-cased categorical column names.
        numerical_features: Lower-cased numerical column names.
    """

    # Binary label columns kept by ``_prepare`` (when present) for evaluation.
    _EVAL_LABEL_COLS = ("DEP_DEL15", "SEVERE_DEL60")
    # Category substituted for null categorical values before indexing.
    _NULL_CATEGORY = "UNKNOWN"
    # Type name ``DataFrame.dtypes`` reports for string columns.
    _STRING_DTYPE = "string"

    def __init__(self, label_col="DEP_DELAY"):
        """
        Initialize the estimator with feature definitions.

        Args:
            label_col (str): Name of the target variable column.
        """
        self.label_col = label_col
        self.pipeline = None
        self.model = None

        # Categorical features: encoded using StringIndexer + OneHotEncoder.
        categorical = [
            "DAY_OF_WEEK",        # Day of week
            "MONTH",              # Month of year
            "DEP_TIME_BLK",       # Departure time block (e.g., "0600-0659")
            "ORIGIN",             # Origin airport code
            "DEST",               # Destination airport code
            "OP_UNIQUE_CARRIER",  # Operating carrier code
        ]
        self.categorical_features = [name.lower() for name in categorical]

        # Numerical features: median-imputed and standardized.
        numerical = [
            "HourlyWindSpeed",      # Wind speed
            "HourlyVisibility",     # Visibility
            "HourlyPrecipitation",  # Precipitation
            "DISTANCE",             # Flight distance
        ]
        self.numerical_features = [name.lower() for name in numerical]

        # Feature columns that were actually present when the pipeline was
        # built; ``transform`` reuses them so scoring matches the fitted stages.
        self._active_numerical = None
        self._active_categorical = None

    def _prepare(self, df):
        """
        Clean the label and numerical features and keep only needed columns.

        Steps:
            1. Cast the label to DoubleType and drop rows whose label is
               null/NaN.
            2. Select the configured features, the label and the binary
               evaluation labels -- only those present in ``df`` (the
               membership test is an exact, case-sensitive name match).
            3. Convert each numerical feature to DoubleType:
               - string columns: strip everything except digits, ``+``, ``-``
                 and ``.`` (e.g. "12.5mph" -> "12.5"), map empty results to
                 null, then cast;
               - all other columns: cast directly and map NaN to null.

        Args:
            df (pyspark.sql.DataFrame): Input DataFrame containing the label.

        Returns:
            pyspark.sql.DataFrame: Cleaned DataFrame.
        """
        df = df.withColumn(self.label_col, col(self.label_col).cast(DoubleType()))
        df = df.filter(~(col(self.label_col).isNull() | isnan(col(self.label_col))))

        # dict.fromkeys de-duplicates while preserving order, so a label_col
        # equal to one of the evaluation labels is not selected twice.
        wanted = dict.fromkeys(
            self.categorical_features
            + self.numerical_features
            + [self.label_col, *self._EVAL_LABEL_COLS]
        )
        present = set(df.columns)
        df = df.select(*[name for name in wanted if name in present])

        dtypes = dict(df.dtypes)
        for f in self.numerical_features:
            if f not in dtypes:
                continue
            if dtypes[f] == self._STRING_DTYPE:
                # Regex cleaning is only meaningful for raw text values.
                cleaned = regexp_replace(col(f), r"[^0-9+\-\.]", "")
                cleaned = when(length(trim(cleaned)) == 0, None).otherwise(cleaned)
                df = df.withColumn(f, cleaned.cast(DoubleType()))
            else:
                # Do NOT round-trip numeric columns through strings: doubles
                # can stringify in exponent form (e.g. "1.0E-4"), which the
                # regex above would turn into an unparseable "1.0-4".
                as_double = col(f).cast(DoubleType())
                df = df.withColumn(f, when(isnan(as_double), None).otherwise(as_double))

        return df

    def _add_clean_columns(self, df, categorical_cols):
        """
        Add a ``<name>_clean`` string column for each categorical feature.

        Nulls become the "UNKNOWN" category; every other value is cast to
        string (a no-op for columns that are already strings). These columns
        are the inputs of the StringIndexer stages, so they must be added
        before both fitting and scoring.

        Args:
            df (pyspark.sql.DataFrame): Prepared DataFrame.
            categorical_cols (list[str]): Categorical columns present in ``df``.

        Returns:
            pyspark.sql.DataFrame: ``df`` with the ``*_clean`` columns added.
        """
        for f in categorical_cols:
            as_text = col(f).cast(StringType())
            df = df.withColumn(
                f"{f}_clean",
                when(col(f).isNull(), self._NULL_CATEGORY).otherwise(as_text),
            )
        return df

    def _build_pipeline(self, df):
        """
        Build the (unfitted) Spark ML Pipeline and store it in ``self.pipeline``.

        Pipeline Stages, in order:
            1. One median ``Imputer`` per numerical feature
            2. ``StringIndexer`` + ``OneHotEncoder`` per categorical feature
            3. ``VectorAssembler`` -> "features"
            4. ``StandardScaler`` -> "scaled_features"
            5. ``LinearRegression`` on "scaled_features"

        Only features present in ``df`` are used; the chosen columns are
        remembered so ``transform`` can apply the same set.

        Args:
            df (pyspark.sql.DataFrame): Prepared DataFrame (see ``_prepare``).

        Returns:
            pyspark.sql.DataFrame: ``df`` with the ``*_clean`` categorical
            columns added, ready to be passed to ``self.pipeline.fit``.

        Raises:
            ValueError: If none of the configured features exist in ``df``.
        """
        present = set(df.columns)
        num_cols = [f for f in self.numerical_features if f in present]
        cat_cols = [f for f in self.categorical_features if f in present]
        if not (num_cols or cat_cols):
            raise ValueError(
                "None of the configured feature columns are present in the DataFrame"
            )

        df = self._add_clean_columns(df, cat_cols)

        # Stage 1: median imputation (median chosen over mean for robustness
        # to outliers).
        stages = [
            Imputer(inputCols=[f], outputCols=[f"{f}_imputed"], strategy="median")
            for f in num_cols
        ]

        # Stage 2-3: categorical encoding.
        # handleInvalid="keep" lets labels unseen at fit time pass through
        # instead of raising.
        # NOTE(review): with "keep" the indexer reserves an extra bucket for
        # unseen labels, so dropLast=True presumably drops that bucket rather
        # than a real category -- confirm against the Spark ML docs.
        for f in cat_cols:
            stages.append(StringIndexer(inputCol=f"{f}_clean",
                                        outputCol=f"{f}_indexed",
                                        handleInvalid="keep"))
            stages.append(OneHotEncoder(inputCols=[f"{f}_indexed"],
                                        outputCols=[f"{f}_encoded"],
                                        dropLast=True))

        # Stage 4: assemble imputed numerical + encoded categorical features.
        # handleInvalid="skip" drops rows that still contain invalid values.
        feature_columns = ([f"{f}_imputed" for f in num_cols]
                           + [f"{f}_encoded" for f in cat_cols])
        assembler = VectorAssembler(inputCols=feature_columns,
                                    outputCol="features",
                                    handleInvalid="skip")

        # Stage 5: standardize to zero mean / unit variance.
        # NOTE(review): the Spark docs warn that withMean=True produces dense
        # vectors; with one-hot encoded airports this may be memory-heavy.
        scaler = StandardScaler(inputCol="features",
                                outputCol="scaled_features",
                                withStd=True,
                                withMean=True)

        # Stage 6: unregularized linear regression baseline.
        lr = LinearRegression(featuresCol="scaled_features",
                              labelCol=self.label_col,
                              maxIter=100,
                              regParam=0.0,
                              elasticNetParam=0.0)

        stages.extend([assembler, scaler, lr])
        self.pipeline = Pipeline(stages=stages)
        self._active_numerical = num_cols
        self._active_categorical = cat_cols

        return df

    def fit(self, df):
        """
        Fit the pipeline on training data.

        Args:
            df (pyspark.sql.DataFrame): Training DataFrame.

        Returns:
            BaselineEstimator: self (for method chaining).
        """
        df_prep = self._build_pipeline(self._prepare(df))
        self.model = self.pipeline.fit(df_prep)
        return self

    def transform(self, df):
        """
        Transform data using the fitted pipeline.

        The input goes through the same ``_prepare`` cleaning and gets the
        same ``*_clean`` categorical columns as at fit time. The pipeline is
        not rebuilt, so ``self.pipeline`` stays in sync with ``self.model``.

        Args:
            df (pyspark.sql.DataFrame): DataFrame to transform (val/test).

        Returns:
            pyspark.sql.DataFrame: Output of the fitted ``PipelineModel``.

        Raises:
            RuntimeError: If called before ``fit``.
            ValueError: If a feature column used at fit time is missing.
        """
        if self.model is None:
            raise RuntimeError("BaselineEstimator.transform() called before fit()")

        df_prep = self._prepare(df)
        present = set(df_prep.columns)

        # Fall back to "whatever is present" only when the model was attached
        # without going through fit() on this instance.
        num_cols = self._active_numerical
        if num_cols is None:
            num_cols = [f for f in self.numerical_features if f in present]
        cat_cols = self._active_categorical
        if cat_cols is None:
            cat_cols = [f for f in self.categorical_features if f in present]

        missing = [f for f in num_cols + cat_cols if f not in present]
        if missing:
            raise ValueError(
                f"Feature columns used at fit time are missing: {missing}"
            )

        df_prep = self._add_clean_columns(df_prep, cat_cols)
        return self.model.transform(df_prep)

In [0]:
# Build the cross-validation driver again with a fresh BaselineEstimator (the
# class definition from the cell directly above is the one in effect here) and
# run evaluate() on the "60M" version.
# NOTE(review): evaluate() and `version` semantics live in cv.py -- confirm there.
cv_set = cv.FlightDelayCV(
    estimator=BaselineEstimator(),
    dataloader=data_loader,
    version="60M"
)

cv_set.evaluate()