## Parameters
For predicting the delay of a flight we need: 

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the four source workbooks in one pass; the order of the paths matches
# the order of the names on the left-hand side.
df_AT, df_TIn, df_TOut, df_AAD = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "datasets/AirportTraffic.xlsx",
        "datasets/Taxi-In_Additional_Time.xlsx",
        "datasets/Taxi-Out_Additional_Time.xlsx",
        "datasets/AA_ATFM_Delay.xlsx",
    )
)

In [36]:
# Derive the ATFM frame's date features from its OWN flight dates. Assigning
# columns computed from df_AT aligns on the row index only, which attaches
# another table's dates to these rows (and NaN where the indexes differ).
aad_flight_dates = pd.to_datetime(df_AAD["FLT_DATE"])

# Weekend flag: 1 when dayofweek is 5 or 6, 0 otherwise.
df_AAD["Is_Weekend"] = aad_flight_dates.dt.dayofweek.isin([5, 6]).astype(int)

# Text date with a non-padded day, e.g. "1-Jan-2023". The "%-d" strftime flag
# is platform-specific (not supported on Windows), so format with "%d" and
# strip the leading zero instead; missing dates stay missing.
df_AAD["FLT_DATE"] = aad_flight_dates.dt.strftime("%d-%b-%Y").str.lstrip("0")

In [38]:
# Ensure the flight date is a datetime column before using the .dt accessor.
df_AT["FLT_DATE"] = pd.to_datetime(df_AT["FLT_DATE"])
day_of_week = df_AT["FLT_DATE"].dt.dayofweek

# Weekend flag ('Is_Weekend'): 1 when dayofweek is 5 or greater, 0 otherwise.
df_AT["Is_Weekend"] = day_of_week.ge(5).astype(int)

Prepare the airport traffic (AT) dataset

In [39]:
# Number of busiest airports (by summed FLT_TOT_1) to keep for the analysis.
N_TOP_AIRPORTS = 20

# Sum the three traffic columns per airport code and keep the busiest ones.
df_top20 = (
    df_AT.groupby(["APT_ICAO"])[["FLT_TOT_1", "FLT_DEP_1", "FLT_ARR_1"]]
    .sum()
    .sort_values(by="FLT_TOT_1", ascending=False)
    .head(N_TOP_AIRPORTS)
    .reset_index()
)

# One label row per airport code. De-duplicating on the code alone (rather than
# on the code/name/state triple) keeps the merge from fanning out if the same
# code appears with more than one name or state spelling in df_AT.
airport_labels = (
    df_AT[["APT_ICAO", "APT_NAME", "STATE_NAME"]]
    .drop_duplicates(subset="APT_ICAO")
)

# Add airport name and state; validate guards the one-row-per-airport invariant.
df_top20 = df_top20.merge(
    airport_labels, on="APT_ICAO", how="left", validate="one_to_one"
)

airports_code_list = df_top20["APT_ICAO"].tolist()
df_airport_codes = pd.DataFrame({'APT_ICAO': airports_code_list})


In [52]:
# Rows of the traffic table that belong to the selected airports.
is_selected_airport = df_AT["APT_ICAO"].isin(airports_code_list)

# Sum the three traffic columns per airport, month and weekend flag, then
# order the groups from the highest FLT_TOT_1 total to the lowest.
df_AT_top20_pred = (
    df_AT.loc[is_selected_airport]
    .groupby(["APT_ICAO", "MONTH_NUM", "Is_Weekend"])[["FLT_TOT_1", "FLT_DEP_1", "FLT_ARR_1"]]
    .sum()
    .sort_values(by="FLT_TOT_1", ascending=False)
    .reset_index()
)


Prepare the taxi-time datasets (taxi-in and taxi-out)

In [47]:
def summarise_taxi_time(taxi_raw, airport_codes, rename_map, years=(2023, 2024)):
    """Sum taxi-time columns per airport and month for selected airports and years.

    Parameters
    ----------
    taxi_raw : pd.DataFrame
        Frame with "APT_ICAO", "YEAR" and "MONTH_NUM" columns plus every key
        of ``rename_map``.
    airport_codes : list
        "APT_ICAO" values to keep.
    rename_map : dict
        Maps each column to be summed onto its name in the result.
    years : iterable, default (2023, 2024)
        "YEAR" values to keep.

    Returns
    -------
    pd.DataFrame
        One row per ("APT_ICAO", "MONTH_NUM") group, with the keys as regular
        columns (useful for merging) and the summed columns renamed.
    """
    keep = taxi_raw["APT_ICAO"].isin(airport_codes) & taxi_raw["YEAR"].isin(list(years))
    # No drop_duplicates() after the sum: at that point the group keys live in
    # the index, so it would compare only the summed values and silently drop
    # distinct airport/month groups that happen to have identical totals.
    return (
        taxi_raw.loc[keep]
        .groupby(["APT_ICAO", "MONTH_NUM"])[list(rename_map)]
        .sum()
        .reset_index()
        .rename(columns=rename_map)
    )


# Column names for the taxi-in values
in_cols_to_rename = {
    "VALID_FL": "VALID_FL_IN",
    "TOTAL_REF_NB_FL": "TOTAL_REF_NB_FL_IN",
    "TOTAL_REF_TIME_MIN": "TOT_REF_TIME_MIN_IN",
    "TOTAL_ADD_TIME_MIN": "TOT_ADD_TIME_MIN_IN"
}
df_taxi_time_in = summarise_taxi_time(df_TIn, airports_code_list, in_cols_to_rename)

# Merge the taxi-in values onto the list of airport codes
df_taxi_time_final = pd.merge(df_airport_codes, df_taxi_time_in, on='APT_ICAO', how='left')

# Column names for the taxi-out values
out_cols_to_rename = {
    "VALID_FL": "VALID_FL_OUT",
    "TOTAL_REF_NB_FL": "TOTAL_REF_NB_FL_OUT",
    "TOTAL_REF_TIME_MIN": "TOT_REF_TIME_MIN_OUT",
    "TOTAL_ADD_TIME_MIN": "TOT_ADD_TIME_MIN_OUT"
}
# Same operations as for df_taxi_time_in
df_taxi_time_out = summarise_taxi_time(df_TOut, airports_code_list, out_cols_to_rename)

# Merge taxi-in with taxi-out on airport and month
df_taxi_time = pd.merge(df_taxi_time_final, df_taxi_time_out, on=["APT_ICAO", "MONTH_NUM"], how='left')

Prepare the ATFM delay dataset, aggregated both as totals and as averages

In [56]:
# Drop every row that has a missing value in any column.
df_AAD = df_AAD.dropna()
df_AAD_top20 = df_AAD[df_AAD["APT_ICAO"].isin(airports_code_list)]

# Columns needed: selected by position (columns 7 to 26 of df_AAD).
# NOTE(review): positional slice -- confirm it still matches the intended
# columns whenever the workbook layout changes.
ATFM_cols = list(df_AAD.columns[7:27])

# Both summaries share the same grouping: airport, month and weekend flag.
aad_groups = df_AAD_top20.groupby(["APT_ICAO", "MONTH_NUM", "Is_Weekend"])[ATFM_cols]

# Totals per group.
df_AAD_tot_delays_per_airport = aad_groups.sum().reset_index()

# Averages per group.
df_AAD_avg_delays_per_airport = aad_groups.mean().reset_index()

In [None]:
# Start from the traffic table, attach the average ATFM values per
# airport/month/weekend flag, then the taxi-time values per airport/month.
# Both are left joins, so every traffic row is kept.
df = (
    df_AT_top20_pred
    .merge(df_AAD_avg_delays_per_airport, on=["APT_ICAO", "MONTH_NUM", "Is_Weekend"], how="left")
    .merge(df_taxi_time, on=["APT_ICAO", "MONTH_NUM"], how="left")
)
df.sort_values(by=["APT_ICAO", "MONTH_NUM"]).head()