# Q1: What type of delays are occurring most at the airports? 

In [1]:
import pandas as pd
from pathlib import Path

In [None]:
# --- Configuration: input files and analysis scope ---
# ATFM delay workbook and the sheet that is read from it.
FILE = Path("datasets/AA_ATFM_Delay.xlsx")
SHEET = "Blad1"
YEARS = [2023, 2024]   # use [2023] for only one year
# Airport traffic workbook, kept as a named constant next to FILE so that
# every input path is configured in one place.
TRAFFIC_FILE = Path("datasets/AirportTraffic.xlsx")

# Airport traffic table (read with pandas' default sheet selection).
df_AT = pd.read_excel(TRAFFIC_FILE)

In [3]:
# Lookup from delay-cause code to its human-readable label.
# Built from (code, label) pairs; insertion order follows the list below.
delay_mapping = dict([
    ("A", "Accident/Incident"),
    ("C", "ATC Capacity"),
    ("D", "De-icing"),
    ("E", "Equipment (non-ATC)"),
    ("G", "Aerodrome Capacity"),
    ("I", "Industrial Action (ATC)"),
    ("M", "Airspace Management"),
    ("N", "Industrial Action (non-ATC)"),
    ("O", "Other"),
    ("P", "Special Event"),
    ("R", "ATC Routeing"),
    ("S", "ATC Staffing"),
    ("T", "Equipment (ATC)"),
    ("V", "Environmental Issues"),
    ("W", "Weather"),
    ("NA", "Not specified"),
])

In [4]:
# Number of busiest airports to analyse (change here to widen/narrow the scope).
TOP_N_AIRPORTS = 20

# Sum total / departure / arrival flight counts per airport code, rank the
# airports by total flights (busiest first) and keep the top TOP_N_AIRPORTS.
df_top20 = (
    df_AT.groupby("APT_ICAO")[["FLT_TOT_1", "FLT_DEP_1", "FLT_ARR_1"]]
    .sum()
    .sort_values(by="FLT_TOT_1", ascending=False)
    .head(TOP_N_AIRPORTS)
    .reset_index()
)

# Attach the airport name and state from the original dataset.
# De-duplicate on the airport code alone: de-duplicating on all three columns
# would keep several rows for an airport whose name/state is written in more
# than one way, and the left merge would then repeat that airport in df_top20
# and in airports_code_list. The first name/state seen for each code is kept.
airport_labels = (
    df_AT[["APT_ICAO", "APT_NAME", "STATE_NAME"]]
    .drop_duplicates(subset="APT_ICAO")
)
df_top20 = df_top20.merge(airport_labels, on="APT_ICAO", how="left")

# ICAO codes of the selected airports, in ranking order.
airports_code_list = df_top20["APT_ICAO"].tolist()

In [None]:
# Read the delay sheet, then keep only the rows whose YEAR is in YEARS and
# whose airport code is one of the selected airports.
df = pd.read_excel(FILE, sheet_name=SHEET)
keep_rows = df["YEAR"].isin(YEARS) & df["APT_ICAO"].isin(airports_code_list)
df = df.loc[keep_rows].copy()  # .copy() detaches the result from the raw frame

In [None]:
# Per-cause delay columns are named DLY_APT_ARR_<code>_1. The column
# DLY_APT_ARR_1 (no cause code in its name) is excluded.
DELAY_PREFIX = "DLY_APT_ARR_"
DELAY_SUFFIX = "_1"
NO_DELAY_LABEL = "No delay recorded"

delay_cols = sorted(
    c for c in df.columns
    if c.startswith(DELAY_PREFIX) and c.endswith(DELAY_SUFFIX) and c != "DLY_APT_ARR_1"
)
if not delay_cols:
    raise ValueError("No per-cause DLY_APT_ARR_<code>_1 columns found in the delay data")

# Most frequently occurring delay type per (airport, year, month).
# A cause "occurs" in a row when its delay value is > 0 (missing counts as 0);
# counts holds, for each group, the number of rows in which each cause occurred.
occ = df[delay_cols].fillna(0) > 0
group_keys = ["APT_ICAO", "YEAR", "MONTH_NUM"]
counts = occ.groupby([df[k] for k in group_keys]).sum()

# idxmax returns the first column on ties -- including the all-zero case, which
# would label a group with no delays at all as the alphabetically first cause.
# Blank those groups out instead of reporting a misleading winner.
has_delay = counts.max(axis=1) > 0
best_col = counts.idxmax(axis=1).where(has_delay)

# Strip the fixed prefix/suffix by position rather than str.replace, which
# would remove every "_1" occurrence inside the name.
best_code = best_col.str[len(DELAY_PREFIX):-len(DELAY_SUFFIX)]

# Build the result table: one row per (airport, year, month), indexed by airport.
long_df = (
    best_code.reset_index(name="delay_code")
    .assign(
        Year=lambda d: d["YEAR"].astype(str),
        Month=lambda d: d["MONTH_NUM"].astype(str).str.zfill(2),
        # Codes missing from delay_mapping stay NaN; groups without any delay
        # get an explicit label.
        delay_type=lambda d: d["delay_code"].map(delay_mapping)
                              .mask(d["delay_code"].isna(), NO_DELAY_LABEL),
    )
    .loc[:, ["Year", "Month", "APT_ICAO", "delay_type"]]
    .sort_values(["APT_ICAO", "Year", "Month"])
    .set_index("APT_ICAO")
)

# Show the table and its size.
display(long_df)
print("Rows:", len(long_df))

Let's add some visualisation: 
- What's the split of delay types between 2023 and 2024 (pie charts or histograms)? Are the two years similar?
- What's the most frequent reason for delay at each airport across 2023–2024?
- Based on the previous question, how many airports don't suffer from exogenous causes (meaning they can control what's going wrong)? -> ask JP for the endogenous/exogenous classification of causes