In [78]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score, confusion_matrix

In [79]:
# Load the flights table from disk.
# low_memory=False makes pandas parse the file in a single pass rather than in
# chunks, so each column gets one consistently inferred dtype.
raw_df = pd.read_csv(
    "../files/flights.csv",
    low_memory=False,
)

In [80]:
# Keep only the flights that have a recorded ARRIVAL_DELAY; the delay classes
# used as the model target are derived from this column.
# dropna(subset=...) returns a new frame, so raw_df itself is left untouched.
df = raw_df.dropna(subset=["ARRIVAL_DELAY"])

In [81]:
# Restrict the frame to the model inputs plus ARRIVAL_DELAY (needed to build the target).
# A commented-out earlier variant of this list also named DAY_OF_WEEK,
# SCHEDULED_DEPARTURE, DEPARTURE_DELAY, SCHEDULED_ARRIVAL and DISTANCE.
cols = [
    "AIRLINE",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
    "ARRIVAL_DELAY",
    "MONTH",
    "DAY",
]
df = df.loc[:, cols]

In [82]:
# Work on a random subset of 10,000 rows; the fixed seed keeps the draw reproducible.
df = df.sample(random_state=42, n=10_000)
print(df.shape)

(10000, 6)


In [83]:
# Histogram of the raw ARRIVAL_DELAY values in the sample.
fig = px.histogram(data_frame=df, x="ARRIVAL_DELAY")
fig.show()

In [84]:
# Turn arrival delays into ordinal 10-wide classes.
# Floor division puts a delay d in class k = floor(d / 10), i.e. 10*k <= d < 10*(k + 1),
# so class 0 is [0, 10) and class -1 is [-10, 0).
# Clipping then merges the tails: class -3 collects every delay below -20 and
# class 18 every delay of 180 or more.
df["DELAY_CLASS"] = (
    (df["ARRIVAL_DELAY"] // 10)
    .astype(int)
    .clip(lower=-3, upper=18)
)

In [85]:
# Separate the class label from the model inputs.
# ARRIVAL_DELAY is removed from the inputs as well, since DELAY_CLASS is computed from it.
y = df["DELAY_CLASS"]
X = df.drop(columns=["ARRIVAL_DELAY", "DELAY_CLASS"])

In [86]:
# Columns handled by the categorical transformer; every other column of X is
# treated as numeric (order of X.columns is preserved).
categorical_features = [
    "AIRLINE",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
    "MONTH",
]
numeric_features = list(
    filter(lambda name: name not in categorical_features, X.columns)
)

In [87]:
# Preprocessing and classifier assembled as a single scikit-learn pipeline.

# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns are mapped to integer codes by category_encoders' OrdinalEncoder.
categorical_transformer = Pipeline(
    steps=[("encoder", OrdinalEncoder(cols=categorical_features))]
)

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("rf", RandomForestClassifier(
        n_estimators=100,  # number of trees in the forest
        random_state=0,    # fixed seed for reproducible forests
        max_features=4,    # features considered when looking for a split
        max_depth=5,       # maximum depth of each tree (not a node count)
    )),
])

In [88]:
# Hold out 33 % of the sample for evaluation; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [89]:
# Fit the preprocessing steps and the random forest on the training split.
# Left as the last expression so the notebook displays the fitted pipeline.
model.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocess', ...), ('rf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,mapping,
,cols,"['AIRLINE', 'ORIGIN_AIRPORT', ...]"
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,4
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [90]:
# Predict delay classes on both splits so train and test scores can be compared.
y_pred_train, y_pred_test = (
    model.predict(split) for split in (X_train, X_test)
)

In [91]:
# Score the predictions by treating the ordinal DELAY_CLASS labels as numbers.
# NOTE(review): R² and MAE are regression metrics applied to classifier output;
# a classification metric may be a better fit — confirm the intended evaluation.
train_r2 = r2_score(y_train, y_pred_train) * 100
test_r2 = r2_score(y_test, y_pred_test) * 100
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

# The MAE is measured in classes, not minutes. Each class is 10 wide (delay // 10),
# so the minute figure below is an approximation at bin resolution.
print(f"Score Train R² -- {train_r2:.2f} % | MAE = {train_mae:.2f} classes (~{train_mae * 10:.0f} min)")
print(f"Score Test  R² -- {test_r2:.2f} % | MAE = {test_mae:.2f} classes (~{test_mae * 10:.0f} min)")

Score Train R² -- -9.92 % | MAE = 1.80 min
Score Test  R² -- -9.68 % | MAE = 1.82 min


Compare real values vs predicted values

In [92]:
# Side-by-side table of true and predicted classes for the test split.
# Both columns are passed as plain arrays, so rows are aligned by position on
# a fresh 0..n-1 index rather than by the original row labels of y_test.
df_compare = pd.DataFrame(
    {
        "Actual Class": y_test.to_numpy(),
        "Predicted Class": y_pred_test,
    }
)

In [93]:
# Human-readable label for every delay class, in ascending class order.
# The labels must match the binning used to build DELAY_CLASS (delay // 10,
# clipped to [-3, 18]): class k covers 10*k <= delay < 10*(k + 1), class -3
# collects everything below -20 and class 18 everything from 180 upwards.
delay_labels = {
    -3: "< -20 min",
    **{k: f"{10 * k} à {10 * (k + 1)} min" for k in range(-2, 18)},
    18: ">= 180 min",
}
class_order = list(delay_labels)

# Count each class and align both series on the full class list, so classes
# missing from one series show up as 0 and the x-axis order is always ascending.
actual_counts = (
    df_compare["Actual Class"].value_counts().reindex(class_order, fill_value=0)
)
pred_counts = (
    df_compare["Predicted Class"].value_counts().reindex(class_order, fill_value=0)
)

actual_counts.index = actual_counts.index.map(delay_labels)
pred_counts.index = pred_counts.index.map(delay_labels)

# Grouped bars: actual vs predicted count per delay class.
fig = go.Figure()

fig.add_trace(go.Bar(
    x=actual_counts.index,
    y=actual_counts.values,
    name="Actual"
))

fig.add_trace(go.Bar(
    x=pred_counts.index,
    y=pred_counts.values,
    name="Predicted"
))

fig.update_layout(
    title="Distribution of Actual vs Predicted Delay Classes",
    xaxis_title="Delay Class (10-minute bins)",
    yaxis_title="Count",
    barmode="group"
)

fig.show()
