## FLIGHT STATUS PREDICTOR PROJECT - Notebook 3

#### The goal of this project is to develop and deploy an ML model that lets an end user specify a set of features describing a commercial flight of interest and receive a categorical (yes or no) output indicating whether that flight's arrival will be delayed by at least fifteen minutes.

### Recreate the preprocessor for the web app

In [29]:
# Import libraries
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import shap
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [30]:
# Load the dataset
# The CSV location defaults to the original local path, but it can be
# overridden with the FSP_DATA_PATH environment variable so the notebook can
# run on another machine without editing this cell. An unset or empty
# variable falls back to the default.
DATA_PATH = os.environ.get('FSP_DATA_PATH') or (
    r'C:\Users\Windows 11\Desktop\FSPapp folder\selected_features.csv'
)
data = pd.read_csv(DATA_PATH)

In [31]:
# Define features and target
# Numerical input columns.
numerical_features = [
    'Arr_Hour',
    'Dep_Hour',
    'Dep_Time_Day_Interaction',
    'Distance_Miles',
    'Is_Weekend',
    'Month',
    'Number_of_Flights',
    'Scheduled_Arrival_Time',
    'Scheduled_Departure_Time',
    'Scheduled_Gate_to_Gate_Time',
    'Week_Day',
]

# Categorical input columns.
categorical_features = [
    'Carrier_Name',
    'Destination_City_State',
    'Origin_City_State',
]

# Name of the column the model is trained to predict.
target = 'Arr_Delay_At_Least_15_Minutes'

In [32]:
# Perform stratified sampling
# Select the model inputs (numerical columns first, then categorical) and the
# target from the loaded frame.
feature_columns = numerical_features + categorical_features
X = data[feature_columns]
y = data[target]

# Keep only the "train" side of an 80 % hold-out split, i.e. a 20 % sample of
# the data, stratified on the target; the other side is discarded.
X_sample, _, y_sample, _ = train_test_split(
    X,
    y,
    test_size=0.8,
    stratify=y,
    random_state=42,
)

In [33]:
# Split the sampled data
# 80 % of the sample is used for training/tuning and 20 % is held out.
# The split is stratified on the target so both parts keep the same class
# proportions, consistent with the stratified draw that produced the sample.
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(
    X_sample,
    y_sample,
    test_size=0.2,
    stratify=y_sample,
    random_state=42,
)

In [34]:
# Preprocessor setup
# Numerical columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' makes the encoder tolerate categories at transform
# time that were not present when it was fitted.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# (name, transformer, columns) triples routed by the ColumnTransformer.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [35]:
# Create pipeline
# Preprocessing followed by a gradient-boosting classifier with default
# hyperparameters and a fixed seed.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42)),
]
model_pipeline = Pipeline(steps=pipeline_steps)

In [36]:
# Train the model on the sample
# Fits the preprocessing step and the classifier on the training part of the
# sampled data.
model_pipeline.fit(X=X_train_sample, y=y_train_sample)

In [37]:
# Hyperparameter tuning
# Define parameter grid
# Keys use the '<step name>__<parameter>' form so they reach the pipeline's
# 'classifier' step; 2 x 2 x 2 = 8 candidate combinations.
param_grid = dict(
    classifier__n_estimators=[50, 100],
    classifier__max_depth=[3, 5],
    classifier__learning_rate=[0.01, 0.1],
)

# Initialize GridSearchCV
# 3-fold cross-validation scored by ROC AUC, using all available cores.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='roc_auc',
    verbose=2,
)

# Fit GridSearchCV on the sample
grid_search.fit(X_train_sample, y_train_sample)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [38]:
# Get best parameters and best model from the sample
# best_params_ holds the winning grid entry (keys still carry the step
# prefix); best_estimator_ is the pipeline the search refitted with it.
best_params, best_model_sample = (
    grid_search.best_params_,
    grid_search.best_estimator_,
)

In [40]:
# Clean the best parameters by removing the 'classifier__' prefix
# Only the leading step name is stripped, so a nested parameter name that
# itself contains '__' is kept intact, and only keys that belong to the
# 'classifier' step are kept, because the result is passed straight to the
# classifier's constructor.
classifier_prefix = 'classifier__'
cleaned_best_params = {
    key[len(classifier_prefix):]: value
    for key, value in best_params.items()
    if key.startswith(classifier_prefix)
}

print("Best parameters found:", cleaned_best_params)

Best parameters found: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [42]:
# Retrain on full dataset with the best parameters
# Stratify on the target so the train and test parts keep the same class
# proportions, consistent with the stratified sampling done earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Pipeline.fit fits its steps in place, so reusing the very same transformer
# object here would refit the instance still held by model_pipeline and leave
# that pipeline's classifier out of sync with its preprocessor. Work on an
# unfitted copy instead; the name is rebound so that `preprocessor` refers to
# the transformer fitted together with the final model from this cell onward.
preprocessor = clone(preprocessor)

best_model_full = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42, **cleaned_best_params))
])
best_model_full.fit(X_train, y_train)

In [43]:
# Save the retrained model and preprocessor
# The full pipeline (preprocessing + classifier) is stored in one file.
joblib.dump(best_model_full, 'GradientBoosting_best_model_full.pkl')

# The standalone preprocessor is taken from the fitted pipeline itself rather
# than from a separately named variable, so the saved transformer is always
# the exact one the saved model was trained with.
joblib.dump(best_model_full.named_steps['preprocessor'], 'preprocessor_full.pkl')

['preprocessor_full.pkl']