In [255]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [256]:
# Load the complete flights table. low_memory=False makes pandas parse the file
# in one pass, so each column's dtype is inferred from all of its rows instead
# of chunk by chunk.
flights_csv = "./files/flights.csv"
raw_df = pd.read_csv(flights_csv, low_memory=False)

In [257]:
# Keep only the rows that have a recorded ARRIVAL_DELAY value.
# `notna()` builds the boolean mask directly, instead of comparing the result
# of `isna()` against False; the selected rows are the same.
df = raw_df[raw_df["ARRIVAL_DELAY"].notna()]

In [258]:
# Restrict the frame to the columns used further down (predictors + target).
# Wider candidate list, kept for reference:
# cols = ["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT", "SCHEDULED_DEPARTURE", "DEPARTURE_DELAY", "SCHEDULED_ARRIVAL", "ARRIVAL_DELAY", "DISTANCE"]
cols = [
    "AIRLINE",
    "ORIGIN_AIRPORT",
    "DESTINATION_AIRPORT",
    "DEPARTURE_DELAY",
    "ARRIVAL_DELAY",
]
df = df.loc[:, cols]

In [259]:
# Draw a reproducible random subset of at most 10k rows.
# The size is capped at len(df): asking sample() for more rows than the frame
# holds (without replacement) raises ValueError, which would break this cell
# on a smaller extract of the data.
sample_size = min(10000, len(df))
df = df.sample(n=sample_size, random_state=42)
print(df.shape)

(10000, 5)


In [260]:
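# Disabled exploratory plot: it needs a MONTH column, which `df` no longer has
# once it has been reduced to `cols`.
# fig = px.histogram(df, x=df.MONTH, y="ARRIVAL_DELAY")
# fig.show();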

In [261]:
# Separate the regression target from the predictor columns.
target_column = "ARRIVAL_DELAY"
y = df[target_column]
X = df.drop(columns=[target_column])

In [262]:
# These columns are routed to the categorical transformer; every other
# predictor column is handled as numeric.
categorical_features = ["ORIGIN_AIRPORT", "DESTINATION_AIRPORT", "AIRLINE"]
numeric_features = list(
    filter(lambda column: column not in categorical_features, X.columns)
)

In [263]:
# Preprocessing + model pipeline.

# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns are replaced by integer codes.
categorical_transformer = Pipeline(
    steps=[("encoder", OrdinalEncoder(cols=categorical_features))]
)

# Send each column group to its transformer. Columns listed in neither group
# are dropped (ColumnTransformer's default `remainder`).
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

forest = RandomForestRegressor(
    n_estimators=100,  # number of trees in the forest
    max_features=2,  # features considered when looking for each split
    random_state=0,  # fixed seed so repeated fits give the same forest
    # max_depth=20,  # optional cap on tree depth (left at the default for now)
)

model = Pipeline(steps=[("preprocess", preprocessor), ("rf", forest)])

In [264]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [265]:
# Train: fit the preprocessing steps and the random forest on the training split.
# The fit call is the cell's last expression, so the notebook displays the
# fitted pipeline below the cell.
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('rf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,mapping,
,cols,"['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', ...]"
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,2
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [266]:
# Evaluate: predict on both splits so train and test scores can be compared.
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)

In [267]:
# Training-split metrics. R² is scaled to a percentage for display; MAE is
# reported as-is and printed with a "min" label.
train_mae = mean_absolute_error(y_train, y_pred_train)
train_r2 = 100 * r2_score(y_train, y_pred_train)

# Same metrics on the held-out split.
test_mae = mean_absolute_error(y_test, y_pred_test)
test_r2 = 100 * r2_score(y_test, y_pred_test)

print(f"Score Train R² -- {train_r2:.2f} % | MAE = {train_mae:.2f} min")
print(f"Score Test  R² -- {test_r2:.2f} % | MAE = {test_mae:.2f} min")

Score Train R² -- 97.59 % | MAE = 3.86 min
Score Test  R² -- 86.19 % | MAE = 9.85 min


Compare real values vs predicted values

In [268]:
# Side-by-side table of true and predicted delays for the test split.
# Passing the target as plain values (not as the indexed Series) gives the
# frame a fresh 0..n-1 index directly.
df_compare = pd.DataFrame(
    {
        "Actual Delay": y_test.to_numpy(),
        "Predicted Delay": y_pred_test,
    }
)

In [271]:
# One point per test row: x is the true delay, y is the model's prediction.
fig = px.scatter(
    data_frame=df_compare,
    x="Actual Delay",
    y="Predicted Delay",
    opacity=0.5,
    title="Predicted vs Actual Arrival Delay",
)

# Perfect-prediction reference line (y = x), spanning the observed actual delays.
actual_min = df_compare["Actual Delay"].min()
actual_max = df_compare["Actual Delay"].max()
fig.add_shape(
    type="line",
    x0=actual_min,
    y0=actual_min,
    x1=actual_max,
    y1=actual_max,
    line={"dash": "dot"},
)

fig.show()

In [270]:
# Signed error per test row: positive when the prediction is below the actual delay.
df_compare["Error (min)"] = df_compare["Actual Delay"].sub(
    df_compare["Predicted Delay"]
)

# Histogram of the signed errors.
fig_err = px.histogram(
    data_frame=df_compare,
    x="Error (min)",
    title="Distribution of Prediction Error (minutes)",
    nbins=60,
)

fig_err.show()