# Demo: Model with **Preloaded** Engineered Features

In [0]:
# Dependencies
import sys
import pandas as pd

# Spark Session (Databricks provides this automatically, but we'll create it explicitly for compatibility)
# getOrCreate() hands back the already-active session when one exists, so re-running this cell is harmless.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()


# Load modules from our Databricks repo
# cv.py is loaded straight from its file path rather than through a normal `import`;
# the directory names contain spaces ("Team 4_2", "Cross Validator"), so it is not
# reachable as a regular package path.
# NOTE(review): the path is an absolute workspace location — this cell breaks if the repo is moved or cloned elsewhere.
import importlib.util
cv_path = "/Workspace/Shared/Team 4_2/flight-departure-delay-predictive-modeling/notebooks/Cross Validator/cv.py"
# Step 1: describe the module (name "cv", source file cv_path).
spec = importlib.util.spec_from_file_location("cv", cv_path)
# Step 2: create an empty module object from that description.
cv = importlib.util.module_from_spec(spec)
# Step 3: run cv.py's top-level code inside the module object; only after this line
# are names such as cv.FlightDelayCV and cv.FlightDelayDataLoader available.
# NOTE(review): the module is not registered in sys.modules, so a later plain
# `import cv` would not resolve to this object — confirm that is intended.
spec.loader.exec_module(cv)


from pyspark.sql import functions as F
from pyspark.ml.feature import Imputer, StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, SQLTransformer
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml import Pipeline

## Run Data Loader (REQUIRED)
**IMPORTANT**: `data_loader = cv.FlightDelayDataLoader(suffix="_with_graph_and_metamodels")` MUST be run, followed by `data_loader.load()`, and the resulting `data_loader` must be passed to your cross validator as shown below.

```
cv_obj = cv.FlightDelayCV(
    estimator=lr_pipe,
    dataloader = data_loader,
    version="3M"
)
cv_obj.fit()
```

### What's Available
**Graph Features**: All folds

**Meta Models**
* The 3M, 12M, and 60M folds have
  * `predicted_prev_flight_total_duration_XGB_1`
  * `predicted_prev_flight_air_time_XGB_1`
  * `predicted_prev_flight_turnover_time_XGB_1`
* Generated from [this script](https://dbc-fae72cab-cf59.cloud.databricks.com/editor/files/1920497510759291?o=4021782157704243)


In [0]:
# Build and materialise the shared data loader.
# The suffix selects which stored variant of the folds is read; judging by its
# name, this one carries the graph features and meta-model predictions.
DATA_SUFFIX = "_with_graph_and_metamodels"

data_loader = cv.FlightDelayDataLoader(suffix=DATA_SUFFIX)

# A loader that is handed to the cross validator is not loaded on our behalf,
# so load() has to be invoked explicitly here.
data_loader.load()

## Define Features

To use the preloaded features, simply list the graph and meta-model feature columns in `numerical_features` alongside the other numeric inputs.

In [0]:
# ----------------------------------------------------------------------------
# Categorical model inputs, assembled from three themed groups.
# The concatenation order below fixes the order of `categorical_features`.
# ----------------------------------------------------------------------------

# Core flight identifiers
_flight_identifier_cols = [
    'op_carrier',        # operating carrier (AA, UA, DL, etc.)
    'origin',            # origin airport code
    'origin_state_abr',  # origin state abbreviation
    'dest',              # destination airport code
    'dest_state_abr',    # destination state abbreviation
]

# Temporal features treated as categories
_temporal_cols = [
    'month',         # month (1-12), captures cyclical patterns
    'dep_time_blk',  # scheduled departure time block
    'arr_time_blk',  # scheduled arrival time block
]

# Previous-flight descriptors (present only when lineage data is available)
_prev_flight_cols = [
    'prev_flight_origin',      # previous flight origin airport
    'prev_flight_dest',        # previous flight destination airport
    'prev_flight_op_carrier',  # previous flight carrier
]

categorical_features = _flight_identifier_cols + _temporal_cols + _prev_flight_cols

# ----------------------------------------------------------------------------
# Numerical model inputs, organised by theme.
# Dict insertion order (and the order inside each group) fixes the order of
# `numerical_features`, which is produced by flattening the groups below.
# ----------------------------------------------------------------------------
_numerical_feature_groups = {
    # Core flight characteristics
    "core_flight": [
        'crs_elapsed_time',  # scheduled elapsed time (air time)
        'distance',          # flight distance (miles)
        'elevation',         # airport elevation (if available)
    ],

    # Flight lineage: rank and sequence
    "lineage_rank": [
        'lineage_rank',                  # rank of the flight in the aircraft's sequence
        'lineage_num_previous_flights',  # number of previous flights in the lineage
    ],

    # Flight lineage: scheduled times (data-leakage free)
    "lineage_scheduled": [
        'scheduled_lineage_rotation_time_minutes',    # available time: prev_crs_dep -> curr_crs_dep
        'scheduled_lineage_turnover_time_minutes',    # prev_crs_arr -> curr_crs_dep
        'prev_flight_scheduled_flight_time_minutes',  # scheduled flight time of the previous flight
        'prev_flight_crs_elapsed_time',               # scheduled air time of the previous flight
    ],

    # Flight lineage: "safe" features (data-leakage free, intelligent handling)
    "lineage_safe": [
        'safe_lineage_rotation_time_minutes',      # available time (safe variant)
        'safe_prev_departure_delay',               # previous flight departure delay
        'safe_prev_arrival_delay',                 # previous flight arrival delay
        'safe_time_since_prev_arrival',            # time since the previous flight arrived
        'safe_required_time_prev_flight_minutes',  # required time
        'safe_impossible_on_time_flag',            # binary: required_time > rotation_time
    ],

    # Flight lineage: previous flight characteristics
    "prev_flight": [
        'prev_flight_distance',             # previous flight distance
        'prev_flight_air_time',             # previous flight actual air time
        'prev_flight_taxi_in',              # previous flight taxi-in time
        'prev_flight_taxi_out',             # previous flight taxi-out time
        'prev_flight_actual_elapsed_time',  # previous flight total elapsed time
    ],

    # Flight lineage: cumulative and aggregated features
    "lineage_aggregates": [
        'lineage_cumulative_delay',              # sum of delays before the immediate previous flight
        'lineage_avg_delay_previous_flights',    # average delay before the immediate previous flight
        'lineage_max_delay_previous_flights',    # maximum delay before the immediate previous flight
        'lineage_expected_flight_time_minutes',  # expected flight time (historical)
    ],

    # Flight lineage: required time
    "lineage_required_time": [
        'required_time_prev_flight_minutes',  # expected air_time + expected_turnover_time
    ],

    # Meta-model predictions (pre-computed)
    "meta_models": [
        'predicted_prev_flight_air_time_XGB_1',        # predicted previous flight air time
        'predicted_prev_flight_turnover_time_XGB_1',   # predicted previous flight turnover time
        'predicted_prev_flight_total_duration_XGB_1',  # predicted previous flight total duration
    ],

    # Weather at the current origin
    "origin_weather": [
        'hourlyprecipitation',
        'hourlysealevelpressure',
        'hourlyaltimetersetting',
        'hourlywetbulbtemperature',
        'hourlystationpressure',
        'hourlywinddirection',
        'hourlyrelativehumidity',
        'hourlywindspeed',
        'hourlydewpointtemperature',
        'hourlydrybulbtemperature',
        'hourlyvisibility',
    ],

    # Hourly weather at the previous flight's origin (OTPW dataset only)
    "prev_flight_weather_hourly": [
        'prev_flight_hourlyprecipitation',
        'prev_flight_hourlywindspeed',
        'prev_flight_hourlywinddirection',
        'prev_flight_hourlyvisibility',
        'prev_flight_hourlydrybulbtemperature',
        'prev_flight_hourlydewpointtemperature',
        'prev_flight_hourlyrelativehumidity',
        'prev_flight_hourlysealevelpressure',
        'prev_flight_hourlystationpressure',
        'prev_flight_hourlyaltimetersetting',
        'prev_flight_hourlywetbulbtemperature',
        'prev_flight_hourlywindgustspeed',
        'prev_flight_hourlypressurechange',
        'prev_flight_hourlypressuretendency',
    ],

    # Daily weather at the previous flight's origin (OTPW dataset only)
    "prev_flight_weather_daily": [
        'prev_flight_dailyprecipitation',
        'prev_flight_dailyaveragewindspeed',
        'prev_flight_dailypeakwindspeed',
        'prev_flight_dailymaximumdrybulbtemperature',
        'prev_flight_dailyminimumdrybulbtemperature',
        'prev_flight_dailyaveragedrybulbtemperature',
        'prev_flight_dailysnowfall',
        'prev_flight_dailysnowdepth',
    ],

    # Monthly weather at the previous flight's origin (OTPW dataset only)
    "prev_flight_weather_monthly": [
        'prev_flight_monthlyaveragerh',
        'prev_flight_monthlydeparturefromnormalaveragetemperature',
    ],

    # Graph features (if available via add_graph_features.py)
    "graph": [
        'origin_pagerank_weighted',                # origin airport importance (weighted)
        'origin_pagerank_unweighted',              # origin airport importance (unweighted)
        'dest_pagerank_weighted',                  # destination airport importance (weighted)
        'dest_pagerank_unweighted',                # destination airport importance (unweighted)
        'prev_flight_origin_pagerank_weighted',    # previous flight origin importance (weighted)
        'prev_flight_origin_pagerank_unweighted',  # previous flight origin importance (unweighted)
    ],
}

# Flatten the groups into the single ordered list consumed downstream.
numerical_features = [
    feature
    for group in _numerical_feature_groups.values()
    for feature in group
]

## Run Model

In [0]:
# ----------------------------------------------------------------------------
# Build the linear-regression pipeline
# ----------------------------------------------------------------------------
# `lr_pipe` was passed to FlightDelayCV without being defined in any earlier
# cell, so a fresh-kernel "Run All" stopped here with a NameError. It is built
# here from the `categorical_features` / `numerical_features` lists defined
# above and the pyspark.ml classes imported in the first cell.

# NOTE(review): assumed name of the regression target column — confirm against
# the columns produced by the data loader. (Spark resolves column names
# case-insensitively by default, so "DEP_DELAY" would match as well.)
LABEL_COL = "dep_delay"

# Intermediate column names produced by each preprocessing stage.
imputed_cols = [f"{name}_imputed" for name in numerical_features]
indexed_cols = [f"{name}_idx" for name in categorical_features]
encoded_cols = [f"{name}_ohe" for name in categorical_features]

lr_pipe = Pipeline(stages=[
    # Replace missing numeric values with the per-column mean learned at fit time.
    # NOTE(review): Imputer needs numeric input columns that are not entirely
    # null in the data being fitted — confirm this holds for every fold.
    Imputer(inputCols=numerical_features, outputCols=imputed_cols, strategy="mean"),
    # handleInvalid="keep" sends nulls and categories unseen at fit time to an
    # extra index instead of failing at transform time.
    StringIndexer(inputCols=categorical_features, outputCols=indexed_cols, handleInvalid="keep"),
    OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols),
    VectorAssembler(inputCols=imputed_cols + encoded_cols, outputCol="features_unscaled"),
    # withMean=False avoids densifying the sparse one-hot part of the vector.
    StandardScaler(inputCol="features_unscaled", outputCol="features", withMean=False, withStd=True),
    LinearRegression(featuresCol="features", labelCol=LABEL_COL),
])

# Cross-validate the pipeline on the "3M" folds using the preloaded data.
cv_lr_3m = cv.FlightDelayCV(
    estimator=lr_pipe,
    dataloader=data_loader,
    version="3M"
)
cv_lr_3m.fit()

In [0]:
# Evaluate the fitted 3M run; as the cell's last expression, whatever evaluate()
# returns is rendered as the cell output.
# NOTE(review): what evaluate() reports is defined in cv.py and is not visible from this notebook.
cv_lr_3m.evaluate()


In [0]:
# Same estimator and data loader as the 3M run, pointed at the "60M" folds.
cv_lr_60m = cv.FlightDelayCV(estimator=lr_pipe, dataloader=data_loader, version="60M")
cv_lr_60m.fit()

In [0]:
# Evaluate the fitted 60M run; as the cell's last expression, whatever evaluate()
# returns is rendered as the cell output.
# NOTE(review): what evaluate() reports is defined in cv.py and is not visible from this notebook.
cv_lr_60m.evaluate()