In [0]:
"""
cv.py (simplified, CUSTOM-only, no parametrization)

Assumptions:
- Folds were created from split.py with N_FOLDS = 3 and CREATE_TEST_FOLD = True
- Therefore total fold indices written = 4:
    FOLD_1_VAL, FOLD_2_VAL, FOLD_3_VAL, FOLD_4_TEST
- Files live in:
    dbfs:/mnt/mids-w261/student-groups/Group_4_2/processed
- File naming:
    OTPW_CUSTOM_{VERSION}_FOLD_{i}_{TRAIN|VAL|TEST}.parquet
"""

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
import mlflow
mlflow.autolog(disable=True)

# -----------------------------
# HARD-CODED GLOBALS
# -----------------------------
# Location and naming components of the pre-split parquet folds.
# NOTE(review): nothing in the visible code reads these globals;
# FlightDelayDataLoader carries its own copies of the same values, so keep
# the two in sync (or wire them together) when changing either.
FOLDER_PATH = "dbfs:/mnt/mids-w261/student-groups/Group_4_2/processed"
SOURCE = "CUSTOM"
VERSIONS = ["3M", "12M"]

# 3 CV folds + 1 test fold = 4 total fold indices
TOTAL_FOLDS = 4


class FlightDelayDataLoader:
    """
    CUSTOM-only loader that guarantees all numerical features are cast to double.

    For every requested version it reads the parquet files

        {folder_path}/OTPW_{source}_{version}_FOLD_{i}_{TRAIN|VAL|TEST}.parquet

    for i = 1..total_folds and stores them as a list of
    ``(train_df, holdout_df)`` tuples. The holdout of the last fold is the
    TEST split; every earlier fold uses its VAL split.

    Args:
        folder_path: Directory that contains the fold parquet files.
        source: Dataset source tag used in the file names.
        versions: Dataset versions to load when ``load()`` is called.
        total_folds: Number of fold indices on disk, counting the final
            test fold (3 CV folds + 1 test fold = 4 by default).
    """

    # Whole-value patterns that should be treated as null before the cast.
    # The pattern is a regular expression handed verbatim to Spark's
    # regexp_replace. In this raw string `\s*` is "blank / whitespace only"
    # and `\.` is a lone dot; `\\N` is the literal two-character marker \N.
    NULL_PATTERN = r'^(NA|N/A|NULL|null|None|none|\\N|\s*|\.|M|T)$'

    # Label columns and the Spark type each one is cast to.
    LABEL_TYPES = {
        "DEP_DELAY": "double",
        "DEP_DEL15": "int",
        "SEVERE_DEL60": "int",
    }

    def __init__(
        self,
        folder_path="dbfs:/mnt/mids-w261/student-groups/Group_4_2/processed",
        source="CUSTOM",
        versions=("3M", "12M"),
        total_folds=4,
    ):
        self.folder_path = folder_path
        self.source = source
        self.folds = {}
        self.versions = list(versions)
        self.total_folds = total_folds

        # Columns that _cast_numerics converts to double when present.
        # This is the casting list only; which of these a model actually
        # uses is decided by the pipeline definitions, not here.
        self.numerical_features = [
            'hourlyprecipitation',
            'hourlysealevelpressure',
            'hourlyaltimetersetting',
            'hourlywetbulbtemperature',
            'hourlystationpressure',
            'hourlywinddirection',
            'hourlyrelativehumidity',
            'hourlywindspeed',
            'hourlydewpointtemperature',
            'hourlydrybulbtemperature',
            'hourlyvisibility',
            'crs_elapsed_time',  # scheduled flight time
            'quarter',
            'flights',
            'distance',  # flight distance
            'year',
            'origin_station_lat',
            'origin_station_lon',
            'origin_airport_lat',
            'origin_airport_lon',
            'origin_station_dis',
            'dest_station_lat',
            'dest_station_lon',
            'dest_airport_lat',
            'dest_airport_lon',
            'dest_station_dis',
            'latitude',
            'longitude',
            'elevation',
        ]

    def _cast_numerics(self, df):
        """
        Safely cast all configured numeric columns to doubles.

        Values that match NULL_PATTERN in full ('', whitespace, 'NA', 'M',
        'T', '.', ...) are blanked first and then cast. Columns that are not
        present in ``df`` are skipped. Label columns are cast to the types
        listed in LABEL_TYPES.

        Returns:
            The DataFrame with the casts applied.
        """
        for colname in self.numerical_features:
            if colname in df.columns:
                df = df.withColumn(
                    colname,
                    F.regexp_replace(
                        F.col(colname).cast("string"), self.NULL_PATTERN, ""
                    ).cast("double")
                )

        # Explicitly cast labels to expected numeric types
        for label, spark_type in self.LABEL_TYPES.items():
            if label in df.columns:
                df = df.withColumn(label, F.col(label).cast(spark_type))

        return df

    def _load_parquet(self, name):
        """Read ``{folder_path}/{name}.parquet`` and normalise its numeric columns."""
        spark = SparkSession.builder.getOrCreate()
        df = spark.read.parquet(f"{self.folder_path}/{name}.parquet")
        return self._cast_numerics(df)

    def _load_version(self, version):
        """
        Load every fold of one version.

        Returns:
            A list of ``(train_df, holdout_df)`` tuples ordered by fold
            index; the last tuple holds the TEST split, the others VAL.
        """
        folds = []
        for fold_idx in range(1, self.total_folds + 1):
            prefix = f"OTPW_{self.source}_{version}_FOLD_{fold_idx}"
            # Only the final fold index was written with a TEST split.
            holdout = "VAL" if fold_idx < self.total_folds else "TEST"

            train_df = self._load_parquet(f"{prefix}_TRAIN")
            holdout_df = self._load_parquet(f"{prefix}_{holdout}")
            folds.append((train_df, holdout_df))

        return folds

    def load(self):
        """Load all configured versions into ``self.folds`` (keyed by version)."""
        for version in self.versions:
            self.folds[version] = self._load_version(version)

    def get_version(self, version):
        """
        Return the list of ``(train_df, holdout_df)`` folds for ``version``.

        Raises:
            KeyError: If the version has not been loaded (``load()`` was not
                called, or the version is not in ``self.versions``).
        """
        try:
            return self.folds[version]
        except KeyError:
            raise KeyError(
                f"Version {version!r} is not loaded; call load() first. "
                f"Loaded versions: {sorted(self.folds)}"
            ) from None

# -----------------------------
# EVALUATOR (NULL-SAFE RMSE)
# -----------------------------
class FlightDelayEvaluator:
    """
    Null-safe evaluator for delay predictions.

    ``evaluate`` returns three numbers:
      - "rmse": RMSE between ``prediction_col`` and ``numeric_label_col``.
      - "otpa": accuracy of ``prediction >= 15`` against ``binary_label_col``.
      - "sddr": recall of ``prediction >= 60`` against ``severe_label_col``.

    Rows with a null in the prediction or in the label a metric needs are
    dropped before that metric is computed.
    """

    def __init__(
        self,
        prediction_col="prediction",
        numeric_label_col="DEP_DELAY",
        binary_label_col="DEP_DEL15",
        severe_label_col="SEVERE_DEL60",
    ):
        self.prediction_col = prediction_col
        self.numeric_label_col = numeric_label_col
        self.binary_label_col = binary_label_col
        self.severe_label_col = severe_label_col

        self.rmse_evaluator = RegressionEvaluator(
            predictionCol=prediction_col,
            labelCol=numeric_label_col,
            metricName="rmse"
        )

    def calculate_rmse(self, predictions_df):
        """Return the RMSE over rows where both label and prediction are present."""
        # Drop any residual nulls before RegressionEvaluator sees them
        clean = predictions_df.dropna(
            subset=[self.numeric_label_col, self.prediction_col]
        )
        return self.rmse_evaluator.evaluate(clean)

    @staticmethod
    def _metrics_from_counts(tp, fp, tn, fn):
        """Turn confusion-matrix counts into precision/recall/f1/accuracy.

        Any ratio whose denominator is zero is reported as 0.0.
        """
        total = tp + fp + tn + fn
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        accuracy = (tp + tn) / total if total else 0.0

        return dict(tp=tp, fp=fp, tn=tn, fn=fn,
                    precision=precision, recall=recall, f1=f1, accuracy=accuracy)

    def _calculate_classification_metrics(self, predictions_df, threshold, label_col):
        """
        Binarise the prediction at ``threshold`` and score it against ``label_col``.

        A row counts as predicted-positive when ``prediction >= threshold``.
        Only rows whose label is exactly 0 or 1 contribute to the counts.

        Returns:
            dict with keys tp, fp, tn, fn, precision, recall, f1, accuracy.
        """
        # Null-safe for classification too
        df = predictions_df.dropna(subset=[self.prediction_col, label_col])

        predicted_pos = F.col(self.prediction_col) >= threshold
        actual_pos = F.col(label_col) == 1
        actual_neg = F.col(label_col) == 0

        def tally(condition):
            # sum() over zero rows is null, so fall back to 0.
            return F.coalesce(F.sum(F.when(condition, 1).otherwise(0)), F.lit(0))

        # One aggregation pass yields all four cells of the confusion matrix
        # (instead of one Spark job per cell), and no temporary column whose
        # name depends on the threshold value is needed.
        counts = df.agg(
            tally(predicted_pos & actual_pos).alias("tp"),
            tally(predicted_pos & actual_neg).alias("fp"),
            tally(~predicted_pos & actual_neg).alias("tn"),
            tally(~predicted_pos & actual_pos).alias("fn"),
        ).first()

        return self._metrics_from_counts(
            int(counts["tp"]), int(counts["fp"]), int(counts["tn"]), int(counts["fn"])
        )

    def calculate_otpa_metrics(self, predictions_df):
        """Return accuracy of ``prediction >= 15`` against the binary label."""
        return self._calculate_classification_metrics(
            predictions_df, threshold=15, label_col=self.binary_label_col
        )["accuracy"]

    def calculate_sddr_metrics(self, predictions_df):
        """Return recall of ``prediction >= 60`` against the severe-delay label."""
        return self._calculate_classification_metrics(
            predictions_df, threshold=60, label_col=self.severe_label_col
        )["recall"]

    def evaluate(self, predictions_df):
        """Return ``{"rmse": ..., "otpa": ..., "sddr": ...}`` for the predictions."""
        return {
            "rmse": self.calculate_rmse(predictions_df),
            "otpa": self.calculate_otpa_metrics(predictions_df),
            "sddr": self.calculate_sddr_metrics(predictions_df),
        }


# -----------------------------
# CROSS-VALIDATOR (NO PARAMS)
# -----------------------------
class FlightDelayCV:
    """
    Fixed-fold cross-validator (no hyper-parameter search).

    Args:
        estimator: Object with ``fit(train_df)`` returning a model that has
            ``transform(df)``.
        dataloader: Loader exposing ``get_version(version)``. When falsy, a
            fresh FlightDelayDataLoader is created and loaded.
        version: Dataset version whose folds are used.

    The folds are ``(train_df, holdout_df)`` tuples; every fold except the
    last is used by ``fit()``, and the last one by ``evaluate()``.
    """

    def __init__(self, estimator, dataloader, version):
        self.estimator = estimator
        self.version = version

        if dataloader:
            self.data_loader = dataloader
        else:
            self.data_loader = FlightDelayDataLoader()
            self.data_loader.load()

        self.evaluator = FlightDelayEvaluator()
        self.folds = self.data_loader.get_version(version)

        self.metrics = []
        self.models = []
        self.test_metric = None
        self.test_model = None

    def fit(self):
        """
        Train and score the estimator on every CV fold.

        Returns:
            pandas DataFrame with one row per fold followed by a "mean" row
            and a "std" row, both computed over the fold rows only.
        """
        # Start from a clean slate so calling fit() twice does not mix the
        # folds of two runs into one summary.
        self.metrics = []
        self.models = []

        # CV folds only (exclude last test fold)
        for train_df, val_df in self.folds[:-1]:
            model = self.estimator.fit(train_df)
            preds = model.transform(val_df)

            self.metrics.append(self.evaluator.evaluate(preds))
            self.models.append(model)

        per_fold = pd.DataFrame(self.metrics)
        # Compute both statistics before appending either summary row;
        # otherwise the "mean" row would be counted as an extra observation
        # when the standard deviation is taken.
        fold_mean = per_fold.mean()
        fold_std = per_fold.std()

        summary = per_fold.copy()
        summary.loc["mean"] = fold_mean
        summary.loc["std"] = fold_std
        return summary

    def evaluate(self):
        """
        Train on the last fold's train split and score on its test split.

        Returns:
            The metric dict produced by the evaluator (also stored in
            ``self.test_metric``; the fitted model is kept in ``self.test_model``).
        """
        train_df, test_df = self.folds[-1]
        self.test_model = self.estimator.fit(train_df)
        preds = self.test_model.transform(test_df)
        self.test_metric = self.evaluator.evaluate(preds)
        return self.test_metric

In [0]:
class MedianRegressor:
    """
    Baseline that predicts the (approximate) training median for every row.

    Args:
        label_col: Column whose median is learned in ``fit``.
        prediction_col: Name of the column written by ``transform``.
        relative_error: Relative error passed to ``DataFrame.approxQuantile``.
    """

    def __init__(self, label_col="DEP_DELAY", prediction_col="prediction",
                 relative_error=0.01):
        self.label_col = label_col
        self.prediction_col = prediction_col
        self.relative_error = relative_error
        self.value = None

    def fit(self, df):
        """
        Learn the approximate median of ``label_col`` from ``df``.

        Returns:
            A new fitted MedianRegressor. A fresh object is returned so that
            callers collecting one model per fold do not end up with several
            references to a single instance holding only the last median.

        Raises:
            ValueError: If ``approxQuantile`` returns no value (for example
                when ``df`` has no non-null labels).
        """
        quantiles = df.approxQuantile(self.label_col, [0.5], self.relative_error)
        if not quantiles:
            raise ValueError(
                f"Cannot fit MedianRegressor: no values found in {self.label_col!r}"
            )

        # Keep updating this instance too, for callers that ignore the
        # return value and call transform() on the estimator itself.
        self.value = quantiles[0]

        fitted = MedianRegressor(
            label_col=self.label_col,
            prediction_col=self.prediction_col,
            relative_error=self.relative_error,
        )
        fitted.value = quantiles[0]
        return fitted

    def transform(self, df):
        """
        Return ``df`` with ``prediction_col`` set to the learned median.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.value is None:
            raise RuntimeError("MedianRegressor.transform() called before fit()")
        return df.withColumn(self.prediction_col, F.lit(self.value))

In [0]:
# Build one loader and read every configured version's folds up front; the
# CV runs in later cells are all handed this same loader instance.
data_loader = FlightDelayDataLoader()
data_loader.load()

In [0]:
# Folds of the "3M" version as a list of (train_df, holdout_df) tuples.
# NOTE(review): `folds` is not referenced by any later cell visible here.
folds = data_loader.get_version("3M")

In [0]:
# Baseline run: constant-median predictor on the "12M" folds.
# cv.fit() scores every fold except the last (test) one and returns the
# per-fold metrics table, which the notebook displays as the cell output.
cv = FlightDelayCV(
    estimator=MedianRegressor(),
    dataloader=data_loader,
    version="12M"
)
cv.fit()

In [0]:
# Columns listed as outcomes of the flight.
# NOTE(review): presumably kept as a leakage exclusion list; this list is
# not referenced by any pipeline stage visible in this notebook.
outcome_vars = [
    'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group',
    'dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group',
    'actual_elapsed_time', 'air_time', 'wheels_on', 'taxi_in', 
    'arr_time', 'taxi_out', 'wheels_off', 'dep_time', 'cancelled', 'diverted'
]

# Categorical inputs: each is string-indexed and one-hot encoded by the
# pipeline defined in a later cell. Commented-out entries are candidates
# that were deliberately left out, with the reason next to each.
categorical_features = [
    'day_of_week',
    # 'op_unique_carrier', # redundant with op_carrier
    'op_carrier',
    'origin', # origin airport code
    'origin_state_abr', # origin state abbreviation
    'dest', # destination airport code
    'dest_state_abr', # destination state abbreviation
    # 'tail_num', # excluded bc each plane has this code, too many categories
    'dep_time_blk', # not outcome var bc this is scheduled departure
    'arr_time_blk', # not outcome var bc this is scheduled arrival
    # 'report_type' # type of weather report, not super useful
    # 'op_carrier_fl_num' # just the flight number
    # 'distance_group', # likely important, but already captured in 'distance'

    # 'crs_dep_time', # scheduled departure time, already captured in dep_time_blk
    # 'crs_arr_time', # scheduled arrival time already captured in arr_time_blk
    'day_of_month',
    'month', # cyclical patterns
]

# Numeric inputs: each is mean-imputed by the pipeline defined in a later
# cell. Commented-out entries are excluded from the model.
numerical_features = [
    'hourlyprecipitation',
    'hourlysealevelpressure',
    'hourlyaltimetersetting',
    'hourlywetbulbtemperature',
    'hourlystationpressure',
    'hourlywinddirection',
    'hourlyrelativehumidity',
    'hourlywindspeed',
    'hourlydewpointtemperature',
    'hourlydrybulbtemperature',
    'hourlyvisibility',
    'crs_elapsed_time', # scheduled flight time
    # 'quarter', # inferred from month
    # 'flights', # number of flights? always 1?
    'distance', # flight distance, probably important
    # 'year', # excluded bc new predictions will always be in a new year
    # # latitude and longitude not very useful in linear regression
    # 'origin_station_lat',
    # 'origin_station_lon',
    # 'origin_airport_lat',
    # 'origin_airport_lon',
    # 'origin_station_dis',
    # 'dest_station_lat',
    # 'dest_station_lon',
    # 'dest_airport_lat',
    # 'dest_airport_lon',
    # 'dest_station_dis',
    # 'latitude',
    # 'longitude',
    'elevation',
]

In [0]:
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Linear-regression pipeline:
#   impute numerics -> index + one-hot categoricals -> assemble -> scale -> LR
# The comprehension variable is called `name` (not `col`) so it cannot be
# confused with the `col` function imported from pyspark.sql.functions.

# Replace missing numeric values with the training-set column mean.
imputer = Imputer(
    inputCols=numerical_features,
    outputCols=[f"{name}_IMPUTED" for name in numerical_features],
    strategy="mean"
)

# handleInvalid="keep" routes categories unseen during fit to an extra
# index instead of failing at transform time.
indexer = StringIndexer(
    inputCols=categorical_features,
    outputCols=[f"{name}_INDEX" for name in categorical_features],
    handleInvalid="keep"
)

encoder = OneHotEncoder(
    inputCols=[f"{name}_INDEX" for name in categorical_features],
    outputCols=[f"{name}_VEC" for name in categorical_features]
)

# NOTE: handleInvalid="skip" drops any row that still has an invalid value
# in an input column, so such rows receive no prediction.
assembler = VectorAssembler(
    inputCols=[f"{name}_VEC" for name in categorical_features] +
              [f"{name}_IMPUTED" for name in numerical_features],
    outputCol="features",
    handleInvalid="skip"
)

# withMean=False: centering forces StandardScaler to emit dense vectors,
# which defeats the sparse one-hot encoding of the categorical columns
# (origin/dest included). LinearRegression fits an intercept by default, so
# centering the inputs is not needed; unit-variance scaling is kept.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=False,
    withStd=True
)

lr = LinearRegression(
    featuresCol="scaled_features",
    labelCol="DEP_DELAY",
    elasticNetParam=0.0,
)

lr_pipe = Pipeline(stages=[imputer, indexer, encoder, assembler, scaler, lr])

In [0]:
# Cross-validate the linear-regression pipeline on the "12M" folds; the
# returned per-fold metrics table is displayed as the cell output.
cv = FlightDelayCV(
    estimator=lr_pipe,
    dataloader=data_loader,
    version="12M"
)
cv.fit()

In [0]:
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

# Feature definitions for the random-forest run. These rebind the
# notebook-level lists of the same names defined in an earlier cell.
# NOTE(review): outcome_vars is not referenced by any pipeline stage
# visible in this notebook.
outcome_vars = [
    'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group',
    'dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group',
    'actual_elapsed_time', 'air_time', 'wheels_on', 'taxi_in', 
    'arr_time', 'taxi_out', 'wheels_off', 'dep_time', 'cancelled', 'diverted'
]

# Categorical inputs; the airport and state columns are commented out for
# this model.
categorical_features = [
    'day_of_week',
    'op_carrier',
    # 'origin',
    # 'origin_state_abr',
    # 'dest',
    # 'dest_state_abr',
    'dep_time_blk',
    'arr_time_blk',
    'day_of_month',
    'month',
]

# Numeric inputs (mean-imputed by the pipeline below).
numerical_features = [
    'hourlyprecipitation',
    'hourlysealevelpressure',
    'hourlyaltimetersetting',
    'hourlywetbulbtemperature',
    'hourlystationpressure',
    'hourlywinddirection',
    'hourlyrelativehumidity',
    'hourlywindspeed',
    'hourlydewpointtemperature',
    'hourlydrybulbtemperature',
    'hourlyvisibility',
    'crs_elapsed_time',
    'distance',
    'elevation',
]

# Pipeline stages
# Derived column names are built once and shared by the stages below.
_rf_imputed_cols = [f"{name}_IMPUTED" for name in numerical_features]
_rf_index_cols = [f"{name}_INDEX" for name in categorical_features]
_rf_vec_cols = [f"{name}_VEC" for name in categorical_features]

# Mean-impute the numeric inputs.
imputer = Imputer(
    inputCols=numerical_features, outputCols=_rf_imputed_cols, strategy="mean"
)

# Index the categoricals; unseen categories are kept rather than failing.
indexer = StringIndexer(
    inputCols=categorical_features, outputCols=_rf_index_cols, handleInvalid="keep"
)

# dropLast=False keeps every category as its own indicator for the forest.
encoder = OneHotEncoder(
    inputCols=_rf_index_cols, outputCols=_rf_vec_cols, dropLast=False
)

# One-hot vectors first, imputed numerics after; invalid rows are skipped.
assembler = VectorAssembler(
    inputCols=_rf_vec_cols + _rf_imputed_cols,
    outputCol="features",
    handleInvalid="skip",
)

# Random Forest Regressor
# Tree-based model, so the assembled features are used without scaling.
rf = RandomForestRegressor(
    featuresCol="features", labelCol="DEP_DELAY", numTrees=10, maxDepth=2
)

# Full pipeline: impute -> index -> one-hot -> assemble -> random forest.
_rf_stages = [imputer, indexer, encoder, assembler, rf]
rf_pipe = Pipeline(stages=_rf_stages)

# Cross-validation
# Scores the random-forest pipeline on the "12M" CV folds; the returned
# per-fold metrics table is displayed as the cell output.
cv = FlightDelayCV(
    estimator=rf_pipe,
    dataloader=data_loader,
    version="12M"
)
cv.fit()