In [0]:
# Load local modules: Cross Validator
# cv.py is loaded directly from its workspace file path (rather than through a
# normal `import` statement) and bound to the name `cv` used by the cells below.
import importlib.util
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
# Build an import spec named "cv" for that file, create an empty module object
# from the spec, then execute the file's code inside that module object.
spec = importlib.util.spec_from_file_location("cv", cv_path)
cv = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cv)
# Note: the module is not registered in sys.modules here, so a plain
# `import cv` elsewhere would not pick up this module object.

import mlflow
# Disable MLflow autologging before any model is fitted in this notebook.
mlflow.autolog(disable=True)

In [0]:
from pyspark.sql import functions as F

class MedianRegressor:
    """Constant baseline: predict the approximate label median for every row.

    ``fit`` estimates the median with ``DataFrame.approxQuantile`` and
    ``transform`` appends it as a constant ``prediction`` column.

    Args:
        label_col: Column whose median is learned. Defaults to ``"DEP_DELAY"``.
        relative_error: Relative error handed to ``approxQuantile``
            (``0`` requests the exact quantile). Defaults to ``0.01``.
    """

    def __init__(self, label_col="DEP_DELAY", relative_error=0.01):
        self.label_col = label_col
        self.relative_error = relative_error
        # Learned median; stays None until fit() has succeeded.
        self.value = None

    def fit(self, df):
        """Learn the approximate median of ``label_col`` from ``df``.

        Args:
            df: DataFrame-like object exposing ``approxQuantile``.

        Returns:
            ``self``, so the fitted object can be used directly for ``transform``.

        Raises:
            ValueError: if ``approxQuantile`` yields no value, which happens
                when the frame is empty or the label column holds only nulls.
        """
        quantiles = df.approxQuantile(self.label_col, [0.5], self.relative_error)
        if not quantiles:
            # Fail with an actionable message instead of a bare IndexError.
            raise ValueError(
                f"Cannot compute the median of '{self.label_col}': "
                "the DataFrame is empty or the column contains only nulls"
            )
        self.value = quantiles[0]
        return self

    def transform(self, df):
        """Return ``df`` with a constant ``prediction`` column set to the learned median.

        Raises:
            RuntimeError: if called before ``fit``; otherwise the prediction
                column would silently be all nulls.
        """
        if self.value is None:
            raise RuntimeError("MedianRegressor.transform() called before fit()")
        return df.withColumn("prediction", F.lit(self.value))

In [0]:
# Create the project's data loader and call load() once; this same instance is
# passed as `dataloader` to every FlightDelayCV built in the cells below.
# What load() actually reads is defined in cv.py (not visible in this notebook).
data_loader = cv.FlightDelayDataLoader()
data_loader.load()

In [0]:
# Example of how to pull the folds for one specific data version; not needed in
# your own notebooks. `folds` is not referenced by the other cells shown here:
# FlightDelayCV receives the version name directly instead.
folds = data_loader.get_version("3M")

In [0]:
# Median baseline: wrap MedianRegressor in the project's cross-validation
# helper for the "3M" data version, then fit it.
# NOTE(review): fit() is called explicitly here, whereas the cv_lr / cv_rf cells
# call only evaluate() -- confirm in cv.py whether evaluate() fits on its own.
cv_set = cv.FlightDelayCV(
    estimator=MedianRegressor(),
    dataloader=data_loader,
    version="3M"
)
cv_set.fit()

In [0]:
# Evaluate the median baseline fitted in the previous cell. As the cell's last
# expression, whatever evaluate() returns is what the notebook displays.
cv_set.evaluate()

In [0]:
# Columns that are only known once a flight has actually operated. They must
# never be used as model inputs (target leakage); the assertion at the end of
# this cell enforces that.
outcome_vars = [
    'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group',
    'dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group',
    'actual_elapsed_time', 'air_time', 'wheels_on', 'taxi_in',
    'arr_time', 'taxi_out', 'wheels_off', 'dep_time', 'cancelled', 'diverted'
]

# Columns treated as categories by the modelling pipelines. Commented-out
# entries are deliberate exclusions, kept together with the reason.
categorical_features = [
    'day_of_week',
    # 'op_unique_carrier',  # redundant with op_carrier
    'op_carrier',
    'origin',            # origin airport code
    'origin_state_abr',  # origin state abbreviation
    'dest',              # destination airport code
    'dest_state_abr',    # destination state abbreviation
    # 'tail_num',  # excluded: one code per aircraft, far too many categories
    'dep_time_blk',  # scheduled departure block, so not an outcome variable
    'arr_time_blk',  # scheduled arrival block, so not an outcome variable
    # 'report_type',        # type of weather report, not very useful
    # 'op_carrier_fl_num',  # just the flight number
    # 'distance_group',     # likely important, but already captured in 'distance'

    # 'crs_dep_time',  # scheduled departure time, already captured in dep_time_blk
    # 'crs_arr_time',  # scheduled arrival time, already captured in arr_time_blk
    'day_of_month',
    'month',  # cyclical patterns
]

# Columns treated as continuous numeric inputs by the modelling pipelines.
numerical_features = [
    'hourlyprecipitation',
    'hourlysealevelpressure',
    'hourlyaltimetersetting',
    'hourlywetbulbtemperature',
    'hourlystationpressure',
    'hourlywinddirection',
    'hourlyrelativehumidity',
    'hourlywindspeed',
    'hourlydewpointtemperature',
    'hourlydrybulbtemperature',
    'hourlyvisibility',
    'crs_elapsed_time',  # scheduled flight time
    # 'quarter',  # inferred from month
    # 'flights',  # number of flights? always 1?
    'distance',  # flight distance, probably important
    # 'year',  # excluded because new predictions will always be in a new year
    # Latitude and longitude are not very useful in linear regression:
    # 'origin_station_lat',
    # 'origin_station_lon',
    # 'origin_airport_lat',
    # 'origin_airport_lon',
    # 'origin_station_dis',
    # 'dest_station_lat',
    # 'dest_station_lon',
    # 'dest_airport_lat',
    # 'dest_airport_lon',
    # 'dest_station_dis',
    # 'latitude',
    # 'longitude',
    'elevation',
]

# Guard against target leakage: fail fast if an outcome variable is ever added
# to one of the feature lists above.
_leaked = sorted(set(outcome_vars) & set(categorical_features + numerical_features))
assert not _leaked, f"Outcome variables must not be used as features: {_leaked}"

In [0]:


from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Linear-regression pipeline for DEP_DELAY.
# Reads `numerical_features` and `categorical_features` as they are currently
# bound in the notebook namespace (set by a feature-definition cell).

# 1. Fill missing numeric values with the column mean, writing <col>_IMPUTED.
imputer = Imputer(
    inputCols=numerical_features,
    outputCols=[f"{col}_IMPUTED" for col in numerical_features],
    strategy="mean"
)

# 2. Map every categorical column to a numeric index (<col>_INDEX).
#    handleInvalid="keep" routes values that were not seen while fitting to an
#    extra index instead of raising at transform time.
indexer = StringIndexer(
    inputCols=categorical_features,
    outputCols=[f"{col}_INDEX" for col in categorical_features],
    handleInvalid="keep"
)

# 3. One-hot encode the indices into sparse vectors (<col>_VEC).
encoder = OneHotEncoder(
    inputCols=[f"{col}_INDEX" for col in categorical_features],
    outputCols=[f"{col}_VEC" for col in categorical_features]
)

# 4. Concatenate the encoded categoricals and imputed numerics into "features".
#    handleInvalid="skip" drops any row that still contains an invalid value.
assembler = VectorAssembler(
    inputCols=[f"{col}_VEC" for col in categorical_features] +
              [f"{col}_IMPUTED" for col in numerical_features],
    outputCol="features",
    handleInvalid="skip"
)

# 5. Scale every feature to unit standard deviation.
#    withMean is False on purpose: centering forces StandardScaler to emit dense
#    vectors, which blows up memory for the wide, sparse one-hot features on the
#    large data versions. For a least-squares fit with an intercept, skipping
#    the centering only shifts the intercept; it does not change the fitted
#    predictions mathematically.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=False,
    withStd=True
)

# 6. Linear regression on the scaled features. elasticNetParam=0.0 selects the
#    L2 (ridge) end of the elastic-net mix; regParam is not set here.
lr = LinearRegression(
    featuresCol="scaled_features",
    labelCol="DEP_DELAY",
    elasticNetParam=0.0,
)

lr_pipe = Pipeline(stages=[imputer, indexer, encoder, assembler, scaler, lr])

In [0]:
# Cross-validate the linear-regression pipeline on the "60M" data version.
# NOTE(review): unlike the median-baseline cells, fit() is not called before
# evaluate() here -- confirm in cv.py that evaluate() performs the fitting itself.
cv_lr = cv.FlightDelayCV(
    estimator=lr_pipe,
    dataloader=data_loader,
    version="60M"
)
cv_lr.evaluate()

In [0]:
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

# Random-forest pipeline for DEP_DELAY.
# NOTE: this cell rebinds `outcome_vars`, `categorical_features`,
# `numerical_features` and the stage variables (`imputer`, `indexer`, `encoder`,
# `assembler`) that the linear-regression cell also uses. Re-running the
# linear-regression cell after this one will therefore pick up the reduced
# categorical list defined below.

# Feature definitions
# Columns only known after the flight has operated; never used as inputs.
outcome_vars = [
    'arr_delay', 'arr_delay_new', 'arr_del15', 'arr_delay_group',
    'dep_delay', 'dep_delay_new', 'dep_del15', 'dep_delay_group',
    'actual_elapsed_time', 'air_time', 'wheels_on', 'taxi_in',
    'arr_time', 'taxi_out', 'wheels_off', 'dep_time', 'cancelled', 'diverted'
]

# Origin/destination airport and state columns are left out for this model.
categorical_features = [
    'day_of_week',
    'op_carrier',
    # 'origin',
    # 'origin_state_abr',
    # 'dest',
    # 'dest_state_abr',
    'dep_time_blk',
    'arr_time_blk',
    'day_of_month',
    'month',
]

numerical_features = [
    'hourlyprecipitation',
    'hourlysealevelpressure',
    'hourlyaltimetersetting',
    'hourlywetbulbtemperature',
    'hourlystationpressure',
    'hourlywinddirection',
    'hourlyrelativehumidity',
    'hourlywindspeed',
    'hourlydewpointtemperature',
    'hourlydrybulbtemperature',
    'hourlyvisibility',
    'crs_elapsed_time',
    'distance',
    'elevation',
]

# Pipeline stages
# Fill missing numeric values with the column mean, writing <col>_IMPUTED.
imputer = Imputer(
    inputCols=numerical_features,
    outputCols=[f"{col}_IMPUTED" for col in numerical_features],
    strategy="mean"
)

# Index the categorical columns; "keep" sends values unseen during fitting to
# an extra index instead of raising at transform time.
indexer = StringIndexer(
    inputCols=categorical_features,
    outputCols=[f"{col}_INDEX" for col in categorical_features],
    handleInvalid="keep"
)

encoder = OneHotEncoder(
    inputCols=[f"{col}_INDEX" for col in categorical_features],
    outputCols=[f"{col}_VEC" for col in categorical_features],
    dropLast=False  # Keep all categories for Random Forest
)

# Concatenate encoded categoricals and imputed numerics; "skip" drops any row
# that still contains an invalid value.
assembler = VectorAssembler(
    inputCols=[f"{col}_VEC" for col in categorical_features] +
              [f"{col}_IMPUTED" for col in numerical_features],
    outputCol="features",
    handleInvalid="skip"
)

# Random Forest Regressor
# Note: No StandardScaler needed for tree-based models
# The seed is fixed so that the forest's random sampling, and therefore the
# cross-validation results, are reproducible from one run to the next.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="DEP_DELAY",
    numTrees=100,
    maxDepth=5,
    seed=42,
)

# Pipeline with all categorical features one-hot encoded
rf_pipe = Pipeline(stages=[imputer, indexer, encoder, assembler, rf])

In [0]:
# Cross-validate the random-forest pipeline on the "60M" data version.
# NOTE(review): unlike the median-baseline cells, fit() is not called before
# evaluate() here -- confirm in cv.py that evaluate() performs the fitting itself.
cv_rf = cv.FlightDelayCV(
    estimator=rf_pipe,
    dataloader=data_loader,
    version="60M"
)
cv_rf.evaluate()