In [0]:
import mlflow
# Disable autolog to prevent excessive logging during custom CV loops
mlflow.autolog(disable=True)

import importlib.util
import sys
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql import functions as F

# -------------------------------------------------------------------------
# 1. LOAD CUSTOM MODULES (CV & Graph Features)
# -------------------------------------------------------------------------
def _load_module_from_path(module_name, file_path):
    """
    Import a Python source file by path and return the resulting module.

    Follows the importlib "importing a source file directly" recipe: the module
    is registered in ``sys.modules`` *before* it is executed, so code that looks
    the module up by name (pickling, dataclasses, a later ``import``) can find it.

    Args:
        module_name: Name to give the module (also the ``sys.modules`` key).
        file_path: Path of the ``.py`` file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``file_path``.
        Exception: Whatever the module's own top-level code raises; in that case
            the previous ``sys.modules`` entry (if any) is restored.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {module_name!r} from {file_path!r}")

    module = importlib.util.module_from_spec(spec)

    # Remember what was registered under this name so a failed load does not
    # leave a half-initialised module behind (cells get re-run in notebooks).
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module


# Load cv module directly from file path
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
cv = _load_module_from_path("cv", cv_path)

# Load graph_features module
graph_features_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Feature Engineering/graph_features.py"
graph_features = _load_module_from_path("graph_features", graph_features_path)


In [0]:
# Build the project's data loader (class defined in the dynamically loaded `cv`
# module) and ask it to load its data.
# NOTE(review): the execution cell further down creates a second loader
# (`data_loader`) and calls load() again, and `dataloader` is not referenced in
# any other visible cell — confirm whether this cell is still needed.
dataloader = cv.FlightDelayDataLoader()
dataloader.load()

In [0]:
# -------------------------------------------------------------------------
# 2. FEATURE DEFINITIONS
# -------------------------------------------------------------------------
# Columns handled as categories by the feature pipeline.
categorical_features = [
    'day_of_week',
    'op_carrier',
    'origin',
    'origin_state_abr',
    'dest',
    'dest_state_abr',
    'dep_time_blk',
    'arr_time_blk',
    'day_of_month',
    'month',
]

# Every numeric input column: the hourly weather readings followed by the
# three columns that are also listed in `lineage_features`.
numerical_features = [
    'hourlyprecipitation',
    'hourlysealevelpressure',
    'hourlyaltimetersetting',
    'hourlywetbulbtemperature',
    'hourlystationpressure',
    'hourlywinddirection',
    'hourlyrelativehumidity',
    'hourlywindspeed',
    'hourlydewpointtemperature',
    'hourlydrybulbtemperature',
    'hourlyvisibility',
    'crs_elapsed_time',
    'distance',
    'elevation',
]

# Numeric columns that are withheld from the "no lineage" variant.
lineage_features = ['crs_elapsed_time', 'distance', 'elevation']

# `numerical_features` minus `lineage_features`, original order preserved.
numerical_features_no_lineage = list(
    filter(lambda name: name not in lineage_features, numerical_features)
)

# -------------------------------------------------------------------------
# 3. HELPER FUNCTION
# -------------------------------------------------------------------------
def create_feature_pipeline(categorical_cols, numerical_cols, unique_suffix):
    """
    Build (but do not fit) the feature-engineering Pipeline.

    Stages, in order:
      1. Imputer        - mean-imputes ``numerical_cols``.
      2. StringIndexer  - indexes ``categorical_cols`` (handleInvalid="keep").
      3. OneHotEncoder  - encodes the indexed columns.
      4. VectorAssembler - packs encoded + imputed columns into "features"
                           (handleInvalid="skip").

    Args:
        categorical_cols: Names of the categorical input columns.
        numerical_cols: Names of the numeric input columns.
        unique_suffix: Tag appended to every intermediate column name so two
            pipelines built with different suffixes do not collide.

    Returns:
        An unfitted ``Pipeline`` containing the four stages above.
    """
    def _tagged(columns, tag):
        # e.g. "origin" -> "origin_INDEX_L"
        return [f"{name}_{tag}_{unique_suffix}" for name in columns]

    index_outputs = _tagged(categorical_cols, "INDEX")
    onehot_outputs = _tagged(categorical_cols, "VEC")
    imputed_outputs = _tagged(numerical_cols, "IMPUTED")

    stages = [
        Imputer(
            inputCols=numerical_cols,
            outputCols=imputed_outputs,
            strategy="mean",
        ),
        StringIndexer(
            inputCols=categorical_cols,
            outputCols=index_outputs,
            handleInvalid="keep",
        ),
        OneHotEncoder(
            inputCols=index_outputs,
            outputCols=onehot_outputs,
        ),
        VectorAssembler(
            inputCols=onehot_outputs + imputed_outputs,
            outputCol="features",
            handleInvalid="skip",
        ),
    ]
    return Pipeline(stages=stages)

# -------------------------------------------------------------------------
# 4. VOTING ENSEMBLE ESTIMATOR (Corrected)
# -------------------------------------------------------------------------
class VotingEnsemble:
    """
    Averaging ensemble of two random-forest regressors with a fit/transform API.

    * The "lineage" model is trained on ``categorical_features`` plus all of
      ``numerical_features``.
    * The "no lineage" model is trained on ``categorical_features`` plus
      ``numerical_features_no_lineage``.

    ``transform`` returns the mean of the two predictions in a ``prediction``
    column, together with the label and key columns.

    The feature lists and ``create_feature_pipeline`` are read from module scope.
    """

    def __init__(self, num_trees=50, max_depth=10, seed=None):
        """
        Configure the two (unfitted) regressors.

        Args:
            num_trees: ``numTrees`` for both forests (default 50).
            max_depth: ``maxDepth`` for both forests (default 10).
            seed: Optional random seed for both forests. When ``None`` the
                regressor's own default seed is left untouched.
        """
        shared_params = dict(
            featuresCol="features",
            labelCol="DEP_DELAY",
            numTrees=num_trees,
            maxDepth=max_depth,
        )
        if seed is not None:
            shared_params["seed"] = seed

        # Only the regressors are created here; the feature pipelines are
        # rebuilt inside every fit() call.
        self.regressor_lineage = RandomForestRegressor(
            predictionCol="pred_lineage", **shared_params
        )
        self.regressor_no_lineage = RandomForestRegressor(
            predictionCol="pred_no_lineage", **shared_params
        )

        # Fitted artefacts; all four stay None until fit() succeeds.
        self.feature_model_lineage = None
        self.feature_model_no_lineage = None
        self.model_lineage = None
        self.model_no_lineage = None

        # Descriptive columns carried through to the output of transform().
        self.key_cols = [
            'month', 'day_of_month', 'day_of_week', 'op_carrier',
            'origin', 'dest', 'dep_time_blk', 'arr_time_blk'
        ]
        # Labels required by the FlightDelayEvaluator
        self.required_labels = ['DEP_DELAY', 'DEP_DEL15', 'SEVERE_DEL60']

    @staticmethod
    def _fit_on_cached_features(regressor, feature_model, df):
        """Featurize ``df``, cache it for the regressor fit, and always release the cache."""
        features_df = feature_model.transform(df).cache()
        try:
            return regressor.fit(features_df)
        finally:
            # Runs on failure too, so an aborted fold does not keep the cache pinned.
            features_df.unpersist()

    def fit(self, df):
        """
        Fit fresh feature pipelines and both regressors on ``df``.

        New pipelines are built on every call so the estimator can be refitted
        repeatedly (e.g. once per CV fold). The four fitted artefacts are stored
        on ``self`` only after everything succeeded, so a failed fit never
        leaves a mix of old and new models behind.

        Args:
            df: Training DataFrame containing the feature columns and ``DEP_DELAY``.

        Returns:
            ``self``.
        """
        feature_pipe_lineage = create_feature_pipeline(
            categorical_features, numerical_features, "L"
        )
        feature_pipe_no_lineage = create_feature_pipeline(
            categorical_features, numerical_features_no_lineage, "NL"
        )

        print("Fitting feature pipeline (Lineage)...")
        feature_model_lineage = feature_pipe_lineage.fit(df)

        print("Fitting feature pipeline (No Lineage)...")
        feature_model_no_lineage = feature_pipe_no_lineage.fit(df)

        print("Fitting Regressor (Lineage)...")
        model_lineage = self._fit_on_cached_features(
            self.regressor_lineage, feature_model_lineage, df
        )

        print("Fitting Regressor (No Lineage)...")
        model_no_lineage = self._fit_on_cached_features(
            self.regressor_no_lineage, feature_model_no_lineage, df
        )

        # Commit all fitted pieces together.
        self.feature_model_lineage = feature_model_lineage
        self.feature_model_no_lineage = feature_model_no_lineage
        self.model_lineage = model_lineage
        self.model_no_lineage = model_no_lineage
        return self

    def transform(self, df):
        """
        Score ``df`` with both models and return the averaged prediction.

        Both models are applied to the *same* DataFrame one after the other, so
        every input row yields exactly one output row. No join is used:
        ``key_cols`` contains no unique row identifier, and joining the two
        prediction sets (and the labels) on it would multiply rows that share a
        key combination and pair predictions with the wrong labels.

        Args:
            df: DataFrame with the feature columns, ``key_cols`` and
                ``required_labels``.

        Returns:
            DataFrame with columns ``prediction``, ``required_labels`` and
            ``key_cols`` (in that order).

        Raises:
            RuntimeError: If called before a successful ``fit``.
        """
        fitted_parts = (
            self.feature_model_lineage,
            self.feature_model_no_lineage,
            self.model_lineage,
            self.model_no_lineage,
        )
        if any(part is None for part in fitted_parts):
            raise RuntimeError("VotingEnsemble must be fitted before transforming.")

        # --- Model 1 ---
        scored = self.model_lineage.transform(self.feature_model_lineage.transform(df))
        # Both feature pipelines write to "features"; free the name for the
        # second pipeline. The other intermediate columns carry the "L"/"NL"
        # suffixes and therefore cannot clash.
        scored = scored.drop("features")

        # --- Model 2 (same rows, now also carrying pred_lineage) ---
        scored = self.model_no_lineage.transform(
            self.feature_model_no_lineage.transform(scored)
        )

        # --- Ensemble average ---
        scored = scored.withColumn(
            "prediction",
            (F.col("pred_lineage") + F.col("pred_no_lineage")) / F.lit(2.0)
        )

        # Return strict column set to avoid leaking feature columns
        return scored.select("prediction", *self.required_labels, *self.key_cols)

# -------------------------------------------------------------------------
# 5. EXECUTION
# -------------------------------------------------------------------------
# Step 1 - create the data loader and load the data.
print("Loading data...")
data_loader = cv.FlightDelayDataLoader()
data_loader.load()

# Step 2 - the estimator handed to the cross validator.
ensemble = VotingEnsemble()

# Step 3 - wire estimator and loader into the cross validator.
# Change `version` to "12M" or "60M" as needed.
print("Initializing Cross Validator...")
cv_ensemble_set = cv.FlightDelayCV(
    version="3M",
    dataloader=data_loader,
    estimator=ensemble,
)

# Step 4 - run the cross validation and keep whatever fit() returns.
print("Starting CV Fit...")
results_df = cv_ensemble_set.fit()

In [0]:
# Bare last expression: the notebook renders the value returned by the
# version="3M" cross-validation run in the previous cell.
results_df

In [0]:
# Same estimator and data loader as the earlier run, now with version="12M".
# `cv_ensemble_set` and `results_df` are rebound, replacing the previous run's values.
cv_ensemble_set = cv.FlightDelayCV(
    version="12M",
    dataloader=data_loader,
    estimator=ensemble,
)

# Run the cross validation for this version.
print("Starting CV Fit...")
results_df = cv_ensemble_set.fit()

In [0]:
# Bare last expression: the notebook renders the value returned by the
# version="12M" cross-validation run in the previous cell.
results_df

In [0]:
# Same estimator and data loader as the earlier runs, now with version="60M".
# `cv_ensemble_set` and `results_df` are rebound, replacing the previous run's values.
cv_ensemble_set = cv.FlightDelayCV(
    version="60M",
    dataloader=data_loader,
    estimator=ensemble,
)

# Run the cross validation for this version.
print("Starting CV Fit...")
results_df = cv_ensemble_set.fit()