# Ensemble

In [0]:
import mlflow
# Disable MLflow autologging before any estimator in this notebook is fit.
mlflow.autolog(disable=True)

import importlib.util
import sys

def _load_module_from_path(module_name, file_path):
    """Import a Python source file by absolute path and return the module.

    The module is registered in ``sys.modules`` under ``module_name`` before
    its code is executed, as the importlib "importing a source file directly"
    recipe does, so code that looks the module up by name can find it.

    Args:
        module_name: Name to give the loaded module.
        file_path: Path of the ``.py`` file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``file_path``.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load module {module_name!r} from {file_path!r}"
        )
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module importable by name.
        sys.modules.pop(module_name, None)
        raise
    return module


# Load cv module directly from file path
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
cv = _load_module_from_path("cv", cv_path)

# Load the graph feature-engineering module the same way
graph_features_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Feature Engineering/graph_features.py"
graph_features = _load_module_from_path("graph_features", graph_features_path)

In [0]:
# Instantiate the data loader class defined in cv.py and run its load step.
# NOTE(review): presumably load() reads the flight-delay datasets that the
# cross-validation below consumes — confirm against cv.py.
data_loader = cv.FlightDelayDataLoader()
data_loader.load()

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql import functions as F
from pyspark.ml import PipelineModel # Needed for the fitted stages

# --- Feature Lists (from provided context) ---
# Columns passed to the feature pipeline as categorical inputs.
categorical_features = [
    'day_of_week',
    'op_carrier',
    'origin',
    'origin_state_abr',
    'dest',
    'dest_state_abr',
    'dep_time_blk',
    'arr_time_blk',
    'day_of_month',
    'month',
]

# Columns passed to the feature pipeline as numerical inputs.
numerical_features = [
    'hourlyprecipitation',
    'hourlysealevelpressure',
    'hourlyaltimetersetting',
    'hourlywetbulbtemperature',
    'hourlystationpressure',
    'hourlywinddirection',
    'hourlyrelativehumidity',
    'hourlywindspeed',
    'hourlydewpointtemperature',
    'hourlydrybulbtemperature',
    'hourlyvisibility',
    'crs_elapsed_time',
    'distance',
    'elevation',
]

# Numerical columns that are withheld from the "no lineage" model.
lineage_features = ['crs_elapsed_time', 'distance', 'elevation']

# Everything in numerical_features that is not a lineage feature, in the
# same order as numerical_features.
numerical_features_no_lineage = [
    feature_name
    for feature_name in numerical_features
    if feature_name not in lineage_features
]

# --- Helper Function for Feature Engineering Pipeline ---

def create_feature_pipeline(categorical_cols, numerical_cols, unique_suffix):
    """
    Build an unfitted feature-engineering Pipeline that ends in a VectorAssembler.

    Stages, in order: Imputer (mean) -> StringIndexer (handleInvalid="keep")
    -> OneHotEncoder -> VectorAssembler (handleInvalid="skip").

    Every intermediate column name carries ``unique_suffix`` so that two
    pipelines built with different suffixes do not collide on those columns.
    The assembled vector is always written to the column "features".

    Args:
        categorical_cols: Input column names to index and one-hot encode.
        numerical_cols: Input column names to mean-impute.
        unique_suffix: Tag appended to every intermediate column name.

    Returns:
        A ``pyspark.ml.Pipeline`` with the four stages above (not yet fitted).
    """
    # Numerical branch: mean-impute into suffixed columns.
    filled_names = [f"{col}_IMPUTED_{unique_suffix}" for col in numerical_cols]
    mean_imputer = Imputer(
        strategy="mean",
        inputCols=numerical_cols,
        outputCols=filled_names,
    )

    # Categorical branch: index, then one-hot encode, into suffixed columns.
    index_names = [f"{col}_INDEX_{unique_suffix}" for col in categorical_cols]
    onehot_names = [f"{col}_VEC_{unique_suffix}" for col in categorical_cols]
    label_indexer = StringIndexer(
        handleInvalid="keep",
        inputCols=categorical_cols,
        outputCols=index_names,
    )
    onehot_encoder = OneHotEncoder(
        inputCols=index_names,
        outputCols=onehot_names,
    )

    # One-hot vectors come first, imputed numerics second. The output column
    # is fixed to "features" because the regressors read that column.
    feature_assembler = VectorAssembler(
        inputCols=onehot_names + filled_names,
        outputCol="features",
        handleInvalid="skip",
    )

    ordered_stages = [mean_imputer, label_indexer, onehot_encoder, feature_assembler]
    return Pipeline(stages=ordered_stages)

# --- VotingEnsemble Class (The Corrected Estimator) ---

class VotingEnsemble:
    """
    Average the predictions of two RandomForestRegressor models.

    The "lineage" model is trained on the categorical features plus all
    numerical features; the "no lineage" model is trained on the categorical
    features plus ``numerical_features_no_lineage``. The ensemble prediction
    is the arithmetic mean of the two.

    The class is duck-typed: it exposes ``fit(df)`` and ``transform(df)`` but
    does not subclass ``pyspark.ml.Estimator``.
    """

    def __init__(self):
        # 1. Feature pipelines. Each uses its own suffix ('L' / 'NL') for the
        #    intermediate columns, so both can be applied to the same DataFrame.
        self.feature_pipe_lineage = create_feature_pipeline(
            categorical_features, numerical_features, "L"
        )
        self.feature_pipe_no_lineage = create_feature_pipeline(
            categorical_features, numerical_features_no_lineage, "NL"
        )

        # 2. Regressors, each writing to its own prediction column.
        self.regressor_lineage = RandomForestRegressor(
            featuresCol="features", labelCol="DEP_DELAY",
            predictionCol="pred_lineage", numTrees=50, maxDepth=10
        )
        self.regressor_no_lineage = RandomForestRegressor(
            featuresCol="features", labelCol="DEP_DELAY",
            predictionCol="pred_no_lineage", numTrees=50, maxDepth=10
        )

        # Fitted objects, populated by fit().
        self.feature_model_lineage = None
        self.feature_model_no_lineage = None
        self.model_lineage = None
        self.model_no_lineage = None

    def _ensure_fitted(self):
        """Raise RuntimeError unless fit() has populated all four fitted stages."""
        fitted_stages = (
            self.feature_model_lineage,
            self.model_lineage,
            self.feature_model_no_lineage,
            self.model_no_lineage,
        )
        if any(stage is None for stage in fitted_stages):
            raise RuntimeError("VotingEnsemble must be fitted before transforming.")

    def fit(self, df):
        """
        Fit both feature pipelines and both regressors on ``df``.

        Args:
            df: Training DataFrame containing the feature columns and the
                label column "DEP_DELAY".

        Returns:
            ``self``, with the four fitted stages stored on the instance.
        """
        print("Fitting feature pipeline (with Lineage)...")
        self.feature_model_lineage = self.feature_pipe_lineage.fit(df)
        df_lineage_features = self.feature_model_lineage.transform(df)

        print("Fitting feature pipeline (without Lineage)...")
        self.feature_model_no_lineage = self.feature_pipe_no_lineage.fit(df)
        df_no_lineage_features = self.feature_model_no_lineage.transform(df)

        print("Fitting Regressor (with Lineage)...")
        self.model_lineage = self.regressor_lineage.fit(df_lineage_features)

        print("Fitting Regressor (without Lineage)...")
        self.model_no_lineage = self.regressor_no_lineage.fit(df_no_lineage_features)

        return self

    def transform(self, df):
        """
        Score ``df`` with both models and return the averaged prediction.

        Both models are applied to the same rows in sequence instead of being
        scored separately and joined back together. The descriptive columns
        below do not uniquely identify a flight, so joining on them would
        multiply rows within each key group and pair predictions with the
        wrong labels.

        Args:
            df: DataFrame containing the feature columns and "DEP_DELAY".

        Returns:
            DataFrame with columns "prediction", "DEP_DELAY" and the
            descriptive key columns.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        self._ensure_fitted()

        # Descriptive columns carried through to the output. They are NOT a
        # unique row identifier and must not be used as a join key.
        key_cols = [
            'month', 'day_of_month', 'day_of_week', 'op_carrier',
            'origin', 'dest', 'dep_time_blk', 'arr_time_blk'
        ]

        # --- Model 1: Lineage ---
        scored = self.feature_model_lineage.transform(df)
        scored = self.model_lineage.transform(scored)

        # Both feature pipelines write to "features"; free the name so the
        # second pipeline can be applied to the same DataFrame.
        scored = scored.drop("features")

        # --- Model 2: No Lineage ---
        scored = self.feature_model_no_lineage.transform(scored)
        scored = self.model_no_lineage.transform(scored)

        # --- Ensemble (simple average) ---
        scored = scored.withColumn(
            "prediction",
            (F.col("pred_lineage") + F.col("pred_no_lineage")) / F.lit(2.0)
        )

        # Final output for the CV framework: prediction, label, and key columns.
        return scored.select(F.col("prediction"), F.col("DEP_DELAY"), *key_cols)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql import functions as F
from pyspark.ml import Estimator, Transformer 
from pyspark.ml import PipelineModel # Needed for the fitted stages


# --- Feature Lists (from provided context) ---
# Define these outside the class to be accessible by helper functions
# Categorical inputs, grouped by theme (calendar / carrier and route / time blocks).
categorical_features = [
    'day_of_week',
    'op_carrier', 'origin', 'origin_state_abr', 'dest', 'dest_state_abr',
    'dep_time_blk', 'arr_time_blk',
    'day_of_month', 'month',
]

# Numerical inputs: the hourly* columns first, then the three lineage columns.
numerical_features = [
    'hourlyprecipitation',
    'hourlysealevelpressure',
    'hourlyaltimetersetting',
    'hourlywetbulbtemperature',
    'hourlystationpressure',
    'hourlywinddirection',
    'hourlyrelativehumidity',
    'hourlywindspeed',
    'hourlydewpointtemperature',
    'hourlydrybulbtemperature',
    'hourlyvisibility',
    'crs_elapsed_time',
    'distance',
    'elevation',
]

# Features used to define the Lineage group.
lineage_features = [
    'crs_elapsed_time',
    'distance',
    'elevation',
]

# Numerical features for the 'No Lineage' model: numerical_features minus
# lineage_features, original order preserved.
numerical_features_no_lineage = [
    name for name in numerical_features
    if name not in lineage_features
]

# --- Helper Function for Feature Engineering Pipeline ---

def create_feature_pipeline(categorical_cols, numerical_cols, unique_suffix):
    """
    Assemble the (unfitted) preprocessing Pipeline shared by both regressors.

    The pipeline mean-imputes ``numerical_cols``, string-indexes and one-hot
    encodes ``categorical_cols``, and packs the results into a single vector
    column named "features". All intermediate columns are named
    ``<col>_IMPUTED_<suffix>``, ``<col>_INDEX_<suffix>`` or
    ``<col>_VEC_<suffix>`` so pipelines with different suffixes do not clash.

    Args:
        categorical_cols: Names of the categorical input columns.
        numerical_cols: Names of the numerical input columns.
        unique_suffix: Suffix used in every intermediate column name.

    Returns:
        An unfitted ``pyspark.ml.Pipeline``.
    """
    stages = []

    # Stage 1: fill missing numerical values with the column mean.
    numeric_out = [f"{col}_IMPUTED_{unique_suffix}" for col in numerical_cols]
    stages.append(
        Imputer(inputCols=numerical_cols, outputCols=numeric_out, strategy="mean")
    )

    # Stage 2: map category labels to indices (handleInvalid="keep").
    index_out = [f"{col}_INDEX_{unique_suffix}" for col in categorical_cols]
    stages.append(
        StringIndexer(
            inputCols=categorical_cols,
            outputCols=index_out,
            handleInvalid="keep",
        )
    )

    # Stage 3: one-hot encode the indices.
    vector_out = [f"{col}_VEC_{unique_suffix}" for col in categorical_cols]
    stages.append(OneHotEncoder(inputCols=index_out, outputCols=vector_out))

    # Stage 4: concatenate encoded categoricals then imputed numerics.
    # The output column stays "features" because the regressors read it.
    stages.append(
        VectorAssembler(
            inputCols=vector_out + numeric_out,
            outputCol="features",
            handleInvalid="skip",
        )
    )

    return Pipeline(stages=stages)


# --- VotingEnsemble Class (The Corrected Estimator) ---

class VotingEnsemble:
    """
    Average the predictions of two RandomForestRegressor models.

    The "lineage" model is trained on the categorical features plus all
    numerical features; the "no lineage" model is trained on the categorical
    features plus ``numerical_features_no_lineage``. The ensemble prediction
    is the arithmetic mean of the two.

    The class is duck-typed: it exposes ``fit(df)`` and ``transform(df)`` but
    does not subclass ``pyspark.ml.Estimator``.

    NOTE: The internal Pipelines are created inside fit() so they are fresh
    on every CV fold (the original author reports this avoided an
    IllegalArgumentException when the estimator was reused).
    """

    def __init__(self):
        # 1. Regressors, each writing to its own prediction column.
        self.regressor_lineage = RandomForestRegressor(
            featuresCol="features", labelCol="DEP_DELAY",
            predictionCol="pred_lineage", numTrees=50, maxDepth=10
        )
        self.regressor_no_lineage = RandomForestRegressor(
            featuresCol="features", labelCol="DEP_DELAY",
            predictionCol="pred_no_lineage", numTrees=50, maxDepth=10
        )

        # Fitted objects, populated by fit().
        self.feature_model_lineage = None
        self.feature_model_no_lineage = None
        self.model_lineage = None
        self.model_no_lineage = None

        # Descriptive columns carried through to the output. They do NOT
        # uniquely identify a flight and must not be used as a join key.
        self.key_cols = [
            'month', 'day_of_month', 'day_of_week', 'op_carrier',
            'origin', 'dest', 'dep_time_blk', 'arr_time_blk'
        ]

        # Label columns that transform() returns alongside the prediction.
        self.required_labels = ['DEP_DELAY', 'DEP_DEL15', 'SEVERE_DEL60']

    def _ensure_fitted(self):
        """Raise RuntimeError unless fit() has populated all four fitted stages."""
        fitted_stages = (
            self.feature_model_lineage,
            self.model_lineage,
            self.feature_model_no_lineage,
            self.model_no_lineage,
        )
        # Explicit None checks: do not rely on the truthiness of Spark objects.
        if any(stage is None for stage in fitted_stages):
            raise RuntimeError("VotingEnsemble must be fitted before transforming.")

    def fit(self, df):
        """
        Fit both feature pipelines and both regressors on ``df``.

        Args:
            df: Training DataFrame containing the feature columns and the
                label column "DEP_DELAY".

        Returns:
            ``self``, with the four fitted stages stored on the instance.
        """
        # Fresh, unfitted pipelines on every call (see class NOTE).
        feature_pipe_lineage = create_feature_pipeline(
            categorical_features, numerical_features, "L"
        )
        feature_pipe_no_lineage = create_feature_pipeline(
            categorical_features, numerical_features_no_lineage, "NL"
        )

        df_lineage_features = None
        df_no_lineage_features = None
        try:
            print("Fitting feature pipeline (with Lineage)...")
            self.feature_model_lineage = feature_pipe_lineage.fit(df)
            df_lineage_features = self.feature_model_lineage.transform(df).cache()

            print("Fitting feature pipeline (without Lineage)...")
            self.feature_model_no_lineage = feature_pipe_no_lineage.fit(df)
            df_no_lineage_features = self.feature_model_no_lineage.transform(df).cache()

            print("Fitting Regressor (with Lineage)...")
            self.model_lineage = self.regressor_lineage.fit(df_lineage_features)

            print("Fitting Regressor (without Lineage)...")
            self.model_no_lineage = self.regressor_no_lineage.fit(df_no_lineage_features)
        finally:
            # Release the cached intermediates even when a fit step raises.
            for cached_df in (df_lineage_features, df_no_lineage_features):
                if cached_df is not None:
                    cached_df.unpersist()

        return self

    def transform(self, df):
        """
        Score ``df`` with both models and return the averaged prediction.

        Both models are applied to the same rows in sequence instead of being
        scored separately and joined back together. ``self.key_cols`` does
        not uniquely identify a flight, so joining on it would multiply rows
        within each key group and pair predictions with the wrong labels.

        Args:
            df: DataFrame containing the feature columns and every column in
                ``self.required_labels``.

        Returns:
            DataFrame with columns "prediction", the required label columns,
            and the key columns.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        self._ensure_fitted()

        # --- Model 1: Lineage ---
        scored = self.feature_model_lineage.transform(df)
        scored = self.model_lineage.transform(scored)

        # Both feature pipelines write to "features"; free the name so the
        # second pipeline can be applied to the same DataFrame.
        scored = scored.drop("features")

        # --- Model 2: No Lineage ---
        scored = self.feature_model_no_lineage.transform(scored)
        scored = self.model_no_lineage.transform(scored)

        # --- Ensemble (simple average) ---
        scored = scored.withColumn(
            "prediction",
            (F.col("pred_lineage") + F.col("pred_no_lineage")) / F.lit(2.0)
        )

        # Final output: prediction, all labels, and key columns.
        return scored.select(F.col("prediction"), *self.required_labels, *self.key_cols)

In [0]:
# Create an unfitted ensemble and hand it, together with the data loader, to
# the cross-validation class defined in cv.py.
# NOTE(review): FlightDelayCV presumably calls fit()/transform() on the
# estimator for each fold — confirm against cv.py.
ensemble_estimator = VotingEnsemble()
cv_ensemble_set = cv.FlightDelayCV(
    estimator=ensemble_estimator,
    dataloader=data_loader,
    version="3M" # Use the desired dataset version
)

In [0]:
# Run the cross-validation and keep whatever evaluate() returns.
# NOTE(review): the name suggests a DataFrame of evaluation results — confirm
# the return type in cv.py.
results_df = cv_ensemble_set.evaluate()