# Gradient Boosted Trees - Custom Join Data

In [0]:
import importlib.util
import sys

# Execute the team's cross-validation source file from its workspace location
# and bind the resulting module object to `cv` (no package install required).
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
spec = importlib.util.spec_from_file_location(name="cv", location=cv_path)
cv = importlib.util.module_from_spec(spec)
# The spec's loader runs the file's top-level code inside the new module.
loader = spec.loader
loader.exec_module(cv)

In [0]:
# Outcome columns (delays, actual times, cancellation/diversion flags),
# kept in a list separate from the feature lists below.
outcome_vars = [
    # arrival delay family
    "arr_delay",
    "arr_delay_new",
    "arr_del15",
    "arr_delay_group",
    # departure delay family
    "dep_delay",
    "dep_delay_new",
    "dep_del15",
    "dep_delay_group",
    # durations and actual clock times
    "actual_elapsed_time",
    "air_time",
    "wheels_on",
    "taxi_in",
    "arr_time",
    "taxi_out",
    "wheels_off",
    "dep_time",
    # flight status flags
    "cancelled",
    "diverted",
]

# Columns treated as categorical (string-indexed and one-hot encoded later).
#
# Candidates deliberately left out:
#   op_unique_carrier  - redundant with op_carrier
#   tail_num           - one code per plane, too many categories
#   report_type        - type of weather report, not very useful
#   op_carrier_fl_num  - just the flight number
#   distance_group     - likely important, but already captured in `distance`
#   crs_dep_time       - scheduled departure time, already captured in dep_time_blk
#   crs_arr_time       - scheduled arrival time, already captured in arr_time_blk
categorical_features = [
    "day_of_week",
    "op_carrier",
    # origin airport code and origin state abbreviation
    "origin",
    "origin_state_abr",
    # destination airport code and destination state abbreviation
    "dest",
    "dest_state_abr",
    # scheduled departure/arrival blocks, so these are not outcome variables
    "dep_time_blk",
    "arr_time_blk",
    "day_of_month",
    # cyclical patterns
    "month",
]

# Columns treated as numeric (mean-imputed later).
#
# Candidates deliberately left out:
#   quarter  - inferred from month
#   flights  - number of flights? appears to always be 1
#   year     - new predictions will always be in a new year
#   latitude/longitude style columns, noted as not very useful in linear
#   regression: origin_station_lat, origin_station_lon, origin_airport_lat,
#   origin_airport_lon, origin_station_dis, dest_station_lat, dest_station_lon,
#   dest_airport_lat, dest_airport_lon, dest_station_dis, latitude, longitude
numerical_features = [
    # hourly weather observations
    "hourlyprecipitation",
    "hourlysealevelpressure",
    "hourlyaltimetersetting",
    "hourlywetbulbtemperature",
    "hourlystationpressure",
    "hourlywinddirection",
    "hourlyrelativehumidity",
    "hourlywindspeed",
    "hourlydewpointtemperature",
    "hourlydrybulbtemperature",
    "hourlyvisibility",
    # scheduled flight time
    "crs_elapsed_time",
    # flight distance, probably important
    "distance",
    "elevation",
]

In [0]:
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor

# Derived column names, computed once so that each stage's outputs and the
# next stage's inputs cannot drift apart.
imputed_cols = [f"{col}_IMPUTED" for col in numerical_features]
index_cols = [f"{col}_INDEX" for col in categorical_features]
vec_cols = [f"{col}_VEC" for col in categorical_features]

# NOTE(review): the label is spelled upper-case while every feature column in
# this notebook is lower-case; confirm it matches the column name in the data
# that FlightDelayCV supplies.
label_col = "DEP_DELAY"

# Pin the GBT seed so repeated runs are comparable. Without it PySpark falls
# back to a default derived from hash(class name), which is not guaranteed to
# be stable across Python driver processes.
GBT_SEED = 42

# Replace missing numeric values with the column mean.
imputer = Imputer(
    inputCols=numerical_features,
    outputCols=imputed_cols,
    strategy="mean"
)

# Map each categorical column to an index; "keep" puts unseen/invalid values
# in an extra bucket instead of failing.
indexer = StringIndexer(
    inputCols=categorical_features,
    outputCols=index_cols,
    handleInvalid="keep"
)

# One-hot encode the indexed categorical columns.
encoder = OneHotEncoder(
    inputCols=index_cols,
    outputCols=vec_cols
)

# Concatenate encoded categoricals and imputed numerics into one vector.
# "skip" drops rows that still contain invalid values at this point.
assembler = VectorAssembler(
    inputCols=vec_cols + imputed_cols,
    outputCol="features",
    handleInvalid="skip"
)

# `scaler` and `lr` belong to the linear-regression variant of this pipeline,
# Pipeline(stages=[imputer, indexer, encoder, assembler, scaler, lr]), which is
# not assembled here; they are kept so the names remain available.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True
)

lr = LinearRegression(
    featuresCol="scaled_features",
    labelCol=label_col,
    elasticNetParam=0.0,
)

# Gradient-boosted trees consume the unscaled assembled vector directly.
gbt = GBTRegressor(
    featuresCol="features",
    labelCol=label_col,
    seed=GBT_SEED
)

gbt_pipe = Pipeline(stages=[imputer, indexer, encoder, assembler, gbt])

In [0]:
# Cross-validate the plain GBT pipeline on the "3M" data version
# (presumably the 3-month dataset - confirm in cv.py), then fit it.
cv_gbt_3M = cv.FlightDelayCV(estimator=gbt_pipe, version="3M")
cv_gbt_3M.fit()

In [0]:
# Evaluate the fitted 3M GBT cross-validation; whatever evaluate() returns is
# shown as this cell's output.
cv_gbt_3M.evaluate()

In [0]:
# Same GBT pipeline, cross-validated on the "12M" data version
# (presumably the 12-month dataset - confirm in cv.py).
cv_gbt_12M = cv.FlightDelayCV(estimator=gbt_pipe, version="12M")
cv_gbt_12M.fit()

In [0]:
# Evaluate the fitted 12M GBT cross-validation; whatever evaluate() returns is
# shown as this cell's output.
cv_gbt_12M.evaluate()

In [0]:
# Same GBT pipeline, cross-validated on the "60M" data version
# (presumably the 60-month dataset - confirm in cv.py).
cv_gbt = cv.FlightDelayCV(estimator=gbt_pipe, version="60M")
cv_gbt.fit()

In [0]:
# Evaluate the fitted 60M GBT cross-validation; whatever evaluate() returns is
# shown as this cell's output.
cv_gbt.evaluate()

In [0]:
# Intentionally left without code: the 60M GBT cross-validation is built and
# fitted once in the earlier 60M cell, and `cv_gbt` from that cell is what gets
# evaluated. An identical second FlightDelayCV(estimator=gbt_pipe,
# version="60M").fit() here would only retrain the same pipeline on the same
# data version and overwrite the already-evaluated `cv_gbt`.

## Try Graph Features

In [0]:
# Execute the graph feature-engineering source file from its workspace
# location and bind the resulting module object to `graph_features`.
graph_features_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Feature Engineering/graph_features.py"
spec = importlib.util.spec_from_file_location(
    name="graph_features",
    location=graph_features_path,
)
graph_features = importlib.util.module_from_spec(spec)
# The spec's loader runs the file's top-level code inside the new module.
loader = spec.loader
loader.exec_module(graph_features)

In [0]:
# Column names the GraphFeaturesEstimator stage is expected to add; they are
# fed to the imputer and assembler alongside the numeric features.
graph_feature_cols = [
    # origin airport
    "origin_pagerank_weighted",
    "origin_pagerank_unweighted",
    # destination airport
    "dest_pagerank_weighted",
    "dest_pagerank_unweighted",
]

In [0]:
# Numeric inputs for this variant: the base numeric features followed by the
# PageRank columns, in that order.
graph_numeric_cols = numerical_features + graph_feature_cols
graph_imputed_cols = [f"{col}_IMPUTED" for col in graph_numeric_cols]
graph_index_cols = [f"{col}_INDEX" for col in categorical_features]
graph_vec_cols = [f"{col}_VEC" for col in categorical_features]

# First stage: builds the origin/dest graph and adds the PageRank feature
# columns (behaviour defined in graph_features.py).
graph_estimator = graph_features.GraphFeaturesEstimator(
    origin_col="origin", dest_col="dest", reset_probability=0.15, max_iter=10
)

# The stage names below are rebound on purpose; `gbt_pipe` keeps referring to
# the stage objects it was built with.
imputer = Imputer(
    inputCols=graph_numeric_cols, outputCols=graph_imputed_cols, strategy="mean"
)

indexer = StringIndexer(
    inputCols=categorical_features, outputCols=graph_index_cols, handleInvalid="keep"
)

encoder = OneHotEncoder(inputCols=graph_index_cols, outputCols=graph_vec_cols)

# Encoded categoricals first, then imputed numerics (including PageRank).
assembler = VectorAssembler(
    inputCols=graph_vec_cols + graph_imputed_cols,
    outputCol="features",
    handleInvalid="skip",
)

# Same GBT estimator as the baseline pipeline, preceded by the graph stage.
gbt_graph_pipe = Pipeline(
    stages=[graph_estimator, imputer, indexer, encoder, assembler, gbt]
)

In [0]:
# Cross-validate the graph-feature GBT pipeline on the "3M" data version
# (presumably the 3-month dataset - confirm in cv.py), then fit it.
cv_graph_3M = cv.FlightDelayCV(estimator=gbt_graph_pipe, version="3M")
cv_graph_3M.fit()

In [0]:
# Evaluate the fitted 3M graph-feature cross-validation; whatever evaluate()
# returns is shown as this cell's output.
cv_graph_3M.evaluate()

In [0]:
# Graph-feature GBT pipeline, cross-validated on the "12M" data version
# (presumably the 12-month dataset - confirm in cv.py).
cv_graph_12M = cv.FlightDelayCV(estimator=gbt_graph_pipe, version="12M")
cv_graph_12M.fit()

In [0]:
# Evaluate the fitted 12M graph-feature cross-validation; whatever evaluate()
# returns is shown as this cell's output.
cv_graph_12M.evaluate()

In [0]:
# Graph-feature GBT pipeline, cross-validated on the "60M" data version
# (presumably the 60-month dataset - confirm in cv.py).
cv_graph_60M = cv.FlightDelayCV(estimator=gbt_graph_pipe, version="60M")
cv_graph_60M.fit()

In [0]:
# Evaluate the fitted 60M graph-feature cross-validation; whatever evaluate()
# returns is shown as this cell's output.
cv_graph_60M.evaluate()

In [0]:
# List the team's processed-data directory on DBFS.
# `dbutils` and `display` are provided by the Databricks notebook runtime.
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
# The base already ends with "/", so strip it before joining to avoid a
# double slash in the resulting path.
processed_dir = f"{data_BASE_DIR.rstrip('/')}/student-groups/Group_4_2/processed/"
display(dbutils.fs.ls(processed_dir))