## Parameters
To predict flight delays we need the following inputs: airport traffic counts, taxi-in and taxi-out additional times, and ATFM delay data.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the four Excel workbooks used by the rest of the notebook, in this order:
# airport traffic, taxi-in additional time, taxi-out additional time, ATFM delay.
df_AT, df_TIn, df_TOut, df_AAD = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "datasets/AirportTraffic.xlsx",
        "datasets/Taxi-In_Additional_Time.xlsx",
        "datasets/Taxi-Out_Additional_Time.xlsx",
        "datasets/AA_ATFM_Delay.xlsx",
    )
)

In [3]:
# Derive the weekend flag and the display-formatted date from df_AAD's OWN FLT_DATE
# column. Taking the dates from df_AT aligns the two frames by row index only, so
# each ATFM row would receive the date of an unrelated airport-traffic row.
aad_flight_dates = pd.to_datetime(df_AAD["FLT_DATE"])

# dayofweek: Monday = 0 ... Sunday = 6, so 5 and 6 are Saturday and Sunday.
df_AAD["Is_Weekend"] = aad_flight_dates.dt.dayofweek.isin([5, 6]).astype(int)

# "%-d" (day without zero padding) is a platform-specific strftime flag that is not
# available on Windows; format with "%d" and strip the leading zero instead.
df_AAD["FLT_DATE"] = aad_flight_dates.dt.strftime("%d-%b-%Y").str.lstrip("0")

In [4]:
# Ensure FLT_DATE is a datetime column before using the .dt accessor.
df_AT["FLT_DATE"] = pd.to_datetime(df_AT["FLT_DATE"])
day_of_week = df_AT["FLT_DATE"].dt.dayofweek  # Monday = 0 ... Sunday = 6

# 'Is_Weekend' is 1 on Saturday/Sunday (dayofweek >= 5) and 0 otherwise.
df_AT["Is_Weekend"] = day_of_week.ge(5).astype(int)

Prepare the airport traffic (AT) dataset

In [5]:
# Number of busiest airports (by total flights) kept for the analysis.
# Change this single value to analyse more or fewer airports.
N_TOP_AIRPORTS = 20

# Total flights, departures and arrivals per airport, busiest first.
df_top20 = (
    df_AT.groupby("APT_ICAO")[["FLT_TOT_1", "FLT_DEP_1", "FLT_ARR_1"]]
    .sum()
    .sort_values(by="FLT_TOT_1", ascending=False)
    .head(N_TOP_AIRPORTS)
    .reset_index()
)

# Attach the airport name and state from the original dataset. Keep exactly one
# name/state per ICAO code: if an airport appears with more than one name or state
# spelling, de-duplicating on all three columns would leave several rows for it and
# the merge would fan out, duplicating codes in airports_code_list / df_airport_codes.
airport_labels = (
    df_AT[["APT_ICAO", "APT_NAME", "STATE_NAME"]]
    .drop_duplicates(subset="APT_ICAO")
)
df_top20 = df_top20.merge(airport_labels, on="APT_ICAO", how="left")

# ICAO codes of the selected airports, as a list (for isin filters) and as a
# one-column frame (for merges).
airports_code_list = df_top20["APT_ICAO"].tolist()
df_airport_codes = pd.DataFrame({"APT_ICAO": airports_code_list})


In [6]:
# Restrict the traffic data to the selected airports.
is_selected_airport = df_AT["APT_ICAO"].isin(airports_code_list)
df_AT_top20_pred = df_AT.loc[is_selected_airport]

# Total the three traffic columns per airport, month number and weekend flag,
# then order the groups from busiest to least busy.
traffic_group_keys = ["APT_ICAO", "MONTH_NUM", "Is_Weekend"]
traffic_totals = (
    df_AT_top20_pred
    .groupby(traffic_group_keys)[["FLT_TOT_1", "FLT_DEP_1", "FLT_ARR_1"]]
    .sum()
)
df_AT_top20_pred = (
    traffic_totals
    .sort_values(by="FLT_TOT_1", ascending=False)
    .reset_index()
)


Prepare the taxi-in and taxi-out time datasets

In [7]:
def _monthly_taxi_totals(taxi_df, airport_codes, cols_to_rename, years=(2023, 2024)):
    """Sum taxi-time columns per airport and month, then rename them.

    Parameters
    ----------
    taxi_df : pandas.DataFrame
        Taxi-in or taxi-out frame. Must contain "APT_ICAO", "YEAR", "MONTH_NUM"
        and every column named as a key of ``cols_to_rename``.
    airport_codes : list
        ICAO codes to keep.
    cols_to_rename : dict
        Maps each column to aggregate to its output name.
    years : iterable of int, optional
        Values of "YEAR" to keep. Defaults to 2023 and 2024.

    Returns
    -------
    pandas.DataFrame
        One row per (APT_ICAO, MONTH_NUM) with the summed, renamed columns;
        the group keys are regular columns so the result can be merged.
    """
    keep_rows = taxi_df["APT_ICAO"].isin(airport_codes) & taxi_df["YEAR"].isin(years)
    return (
        taxi_df.loc[keep_rows]
        .groupby(["APT_ICAO", "MONTH_NUM"])[list(cols_to_rename)]
        .sum()
        # No drop_duplicates() here: it compares only the summed values and ignores
        # the (APT_ICAO, MONTH_NUM) index, so two different airport/month groups with
        # identical totals would be silently removed.
        .reset_index()
        .rename(columns=cols_to_rename)
    )


# Output names for the taxi-in columns
in_cols_to_rename = {
    "VALID_FL": "VALID_FL_IN",
    "TOTAL_REF_NB_FL": "TOTAL_REF_NB_FL_IN",
    "TOTAL_REF_TIME_MIN": "TOT_REF_TIME_MIN_IN",
    "TOTAL_ADD_TIME_MIN": "TOT_ADD_TIME_MIN_IN"
}
# Output names for the taxi-out columns
out_cols_to_rename = {
    "VALID_FL": "VALID_FL_OUT",
    "TOTAL_REF_NB_FL": "TOTAL_REF_NB_FL_OUT",
    "TOTAL_REF_TIME_MIN": "TOT_REF_TIME_MIN_OUT",
    "TOTAL_ADD_TIME_MIN": "TOT_ADD_TIME_MIN_OUT"
}

df_taxi_time_in = _monthly_taxi_totals(df_TIn, airports_code_list, in_cols_to_rename)
df_taxi_time_out = _monthly_taxi_totals(df_TOut, airports_code_list, out_cols_to_rename)

# Start from the airport codes so every selected airport is present, attach the
# taxi-in totals, then attach the taxi-out totals for the same airport and month.
df_taxi_time_final = pd.merge(df_airport_codes, df_taxi_time_in, on="APT_ICAO", how="left")
df_taxi_time = pd.merge(df_taxi_time_final, df_taxi_time_out, on=["APT_ICAO", "MONTH_NUM"], how="left")

Prepare the ATFM delay dataset, with both total and average delays per cause

In [8]:
# Drop every row that has at least one missing value, then keep the selected airports.
# NOTE(review): dropna() discards a row if ANY column is empty - confirm this is intended.
df_AAD = df_AAD.dropna()
df_AAD_top20 = df_AAD.loc[df_AAD["APT_ICAO"].isin(airports_code_list)]

# Columns needed for the aggregation, selected by position (7 to 26 inclusive).
ATFM_cols = list(df_AAD.columns[7:27])

# Build the grouping once and derive both aggregations from it.
atfm_by_airport_month = (
    df_AAD_top20.groupby(["APT_ICAO", "MONTH_NUM", "Is_Weekend"])[ATFM_cols]
)

# Totals per airport, month number and weekend flag
df_AAD_tot_delays_per_airport = atfm_by_airport_month.sum().reset_index()

# Averages per airport, month number and weekend flag
df_AAD_avg_delays_per_airport = atfm_by_airport_month.mean().reset_index()

In [None]:
# Delay rate above which an airport/month/weekend group is labelled delay-prone.
# 0.005 is a placeholder example; the final value is still to be determined.
DELAY_PRONE_THRESHOLD = 0.005

# Traffic totals + ATFM delay totals (same airport, month and weekend flag) ...
df_complete = pd.merge(
    df_AT_top20_pred,
    df_AAD_tot_delays_per_airport,
    on=["APT_ICAO", "MONTH_NUM", "Is_Weekend"],
    how="left",
)
# ... + taxi times, which are only available per airport and month.
df_complete = pd.merge(df_complete, df_taxi_time, on=["APT_ICAO", "MONTH_NUM"], how="left")

# sort_values returns a new frame, so the result must be assigned back;
# calling it without assignment leaves df_complete unsorted.
df_complete = df_complete.sort_values(by=["APT_ICAO", "MONTH_NUM"])

# Share of total flights that are counted in FLT_ARR_1_DLY_15.
df_complete["Delay_Rate"] = df_complete["FLT_ARR_1_DLY_15"] / df_complete["FLT_TOT_1"]
# Binary target: 1 when the delay rate exceeds the threshold, 0 otherwise.
df_complete["Delay_Prone"] = (df_complete["Delay_Rate"] > DELAY_PRONE_THRESHOLD).astype(int)

df_complete.columns

Creating training dataset (2023-2024): we will predict binary variable "Delay_Prone" using all the other variables in a logistic regression.

Assumption: 2023, 2024 and 2025 are years with similar characteristics (earlier years were influenced by COVID-19).

In [None]:
# Columns kept for modelling; rows with a missing value in any of them are dropped.
model_columns = [
    "APT_ICAO", "MONTH_NUM", "Is_Weekend",
    "FLT_TOT_1", "FLT_DEP_1", "FLT_ARR_1_x",
    "Delay_Rate", "Delay_Prone",
]
df = df_complete.loc[:, model_columns].dropna()

# Mean delay rate of each airport, broadcast back onto every row of that airport.
# NOTE(review): this average is computed from Delay_Rate over ALL rows, before any
# train/test split, and Delay_Prone is itself derived from Delay_Rate - using it as a
# feature may leak target information into the test rows. Confirm / compute on train only.
airport_mean_delay = df["Delay_Rate"].groupby(df["APT_ICAO"]).mean()
df["ICAO_AVG_DELAY_RATE"] = df["APT_ICAO"].map(airport_mean_delay)
df

## Data normalization and usability

Logistic regression is sensitive to feature scales: features with large numeric ranges can dominate those with smaller ranges, so we need to normalize some of the numerical variables. We also need to check the count of delay-prone airports to determine whether we have enough data in each category.

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
# Model features, split into columns that are standardised and columns passed as-is.
features = ["MONTH_NUM", "Is_Weekend", "FLT_TOT_1", "FLT_DEP_1", "FLT_ARR_1_x", "ICAO_AVG_DELAY_RATE"]
numeric_features = ["MONTH_NUM", "FLT_TOT_1", "FLT_DEP_1", "FLT_ARR_1_x", "ICAO_AVG_DELAY_RATE"]  # to be scaled
categorical_features = ["Is_Weekend"]  # already 0/1, no transformation needed

X = df[features]  # predictors
y = df["Delay_Prone"]  # predicted variable

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", "passthrough", categorical_features),
    ]
)

# class_weight="balanced" re-weights the classes to moderate the class imbalance.
model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("classifier", LogisticRegression(class_weight="balanced")),
])

# Stratify the split so train and test keep the same delay-prone share; with
# imbalanced classes a purely random split can leave the test set with very few
# positives. Stratification needs at least two rows per class, so fall back to a
# plain random split when that is not the case.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=55, stratify=stratify_labels
)

model.fit(X_train, y_train)  # model training


Model evaluation

In [44]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
y_pred = model.predict(X_test)

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Rows are the true classes and columns the predicted classes, in label order,
# so with labels [0, 1] the layout is [[TN, FP], [FN, TP]].
# Passing labels explicitly keeps this 2x2 layout even if a class is absent from the split.
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))

# Per-class precision, recall and F1
print(classification_report(y_test, y_pred))

False negatives are very high: the model predicted as not delayed many airports that are actually delayed, while it's very good the other way around.

Let's add more variables to the model and see if the accuracy increases, without overfitting

In [None]:
# Previous-month ("PM") features.
# df holds one row per (airport, month number, weekend flag), so the lag is taken
# within each (airport, weekend flag) series ordered by month. Shifting within the
# airport alone would pair a row with the other weekend flag of the SAME month.
lag_keys = ["APT_ICAO", "Is_Weekend"]
df = df.sort_values(by=lag_keys + ["MONTH_NUM"])

previous_month = (
    df.groupby(lag_keys)[["Delay_Rate", "FLT_TOT_1", "FLT_DEP_1", "FLT_ARR_1_x"]]
    .shift(1)
)

# Lag the actual delay rate: ICAO_AVG_DELAY_RATE is constant per airport, so
# shifting it only reproduces the same value and carries no monthly information.
df["PM_DelayRate"] = previous_month["Delay_Rate"]
df["PM_FLT_TOT_1"] = previous_month["FLT_TOT_1"]
df["PM_DepRatio"] = previous_month["FLT_DEP_1"] / previous_month["FLT_TOT_1"]
df["PM_ArrRatio"] = previous_month["FLT_ARR_1_x"] / previous_month["FLT_TOT_1"]

# The first month of every (airport, weekend flag) series has no predecessor:
# fill those gaps with 0, touching only the new lag columns.
pm_columns = ["PM_DelayRate", "PM_FLT_TOT_1", "PM_DepRatio", "PM_ArrRatio"]
df[pm_columns] = df[pm_columns].fillna(0)


In [None]:
# Extended feature set: current-month traffic plus the previous-month ("PM") features.
features = ["MONTH_NUM", "Is_Weekend", "FLT_TOT_1", "ICAO_AVG_DELAY_RATE",
            "PM_DelayRate", "PM_FLT_TOT_1", "PM_DepRatio", "PM_ArrRatio"]
numeric_features = ["MONTH_NUM", "FLT_TOT_1", "ICAO_AVG_DELAY_RATE",
                    "PM_DelayRate", "PM_FLT_TOT_1", "PM_DepRatio", "PM_ArrRatio"]  # to be scaled
categorical_features = ["Is_Weekend"]  # already 0/1, passed through unchanged

X = df[features]  # predictors
y = df["Delay_Prone"]  # predicted variable

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", "passthrough", categorical_features),
    ]
)

# class_weight="balanced" re-weights the classes to moderate the class imbalance.
model_2 = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("classifier", LogisticRegression(class_weight="balanced")),
])

# Stratify the split so train and test keep the same delay-prone share; with
# imbalanced classes a purely random split can leave the test set with very few
# positives. Stratification needs at least two rows per class, so fall back to a
# plain random split when that is not the case.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=55, stratify=stratify_labels
)

model_2.fit(X_train, y_train)

In [None]:
# Score the extended model on the held-out split.
y_pred = model_2.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Confusion matrix: rows are true classes, columns are predicted classes.
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

# Per-class precision, recall and F1
test_report = classification_report(y_test, y_pred)
print(test_report)

Accuracy didn't increase, and false negatives are still very high!

Up next: (1) try random forest model, (2) change the variables used to predict, (3) try to cross validate results 