In [None]:
import joblib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Read the CSV that every later cell accesses through `data`.
data = pd.read_csv(r"../data/cleaned_data.csv")

# Report the frame's size explicitly, then preview only the first rows instead
# of rendering the whole frame as the cell output.
print(f"Loaded {data.shape[0]} rows x {data.shape[1]} columns")
data.head()

In [None]:
# Base feature columns selected by name from `data`; the one-hot carrier
# columns are added to this list by the cell that builds X and y.
features = [
    "DayOfWeekEncoded",
    "Month",
    "CRSDepTimeMinutes",
    "CRSArrTimeMinutes",
    "Distance",
    "CRSElapsedTime",
    "FlightIDEncoded",
    "TaxiOut",
    "DepTimeMinutes_DayOfWeekEncoded",
    "CRSArrTimeMinutes_DayOfWeekEncoded",
    "Origin_Dep_Count",
    "Dest_Arr_Count",
]
target = "ArrDelay"

# One-hot encode the carrier column, dropping the first level.
# The guard keeps this cell re-runnable: after the first run "UniqueCarrier"
# has already been replaced by "UniqueCarrier_*" columns, and calling
# get_dummies on it again would raise a KeyError.
if "UniqueCarrier" in data.columns:
    data = pd.get_dummies(data, columns=["UniqueCarrier"], drop_first=True)
elif not any(col.startswith("UniqueCarrier_") for col in data.columns):
    # Neither the raw column nor its dummies exist: fail loudly, as the
    # unguarded call would have done.
    raise KeyError(
        "'UniqueCarrier' column not found in data and no 'UniqueCarrier_*' dummy columns exist"
    )

In [None]:
# Gather the one-hot carrier columns currently present in `data`.
carrier_columns = [col for col in data.columns if col.startswith("UniqueCarrier_")]

# Append only the names that are not already listed. `features` is extended
# from its own previous value, so without this check every re-run of the cell
# would add the carrier columns again and X would contain duplicate columns.
features = features + [col for col in carrier_columns if col not in features]

# Design matrix and regression target.
X = data[features]
y = data[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=777)

In [None]:
# Base estimator handed to the search; seeded so forest fits are repeatable.
model = RandomForestRegressor(random_state=777)

# Hyperparameter grid: 3 * 3 * 3 * 3 * 2 = 162 candidate settings.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Exhaustive search with 3-fold cross-validation (162 * 3 = 486 fits),
# n_jobs=-1 to use every available processor, and per-fit progress logging.
# No `scoring` is given, so candidates are ranked by the estimator's own
# score method.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
)

# Run the search's parallel jobs on joblib's thread-based backend.
with joblib.parallel_backend("threading"):
    grid_search.fit(X=X_train, y=y_train)

# Persist the winning estimator so it can be reloaded without repeating the search.
joblib.dump(
    value=grid_search.best_estimator_,
    filename=r"../models/grid_search_best_estimator_v3.pkl",
)

In [None]:
# Reload the estimator written by the grid-search cell.
# NOTE: joblib.load unpickles the file, so only point it at artifacts this
# project produced itself.
best_model = joblib.load(
    filename=r"../models/grid_search_best_estimator_v3.pkl",
)

In [None]:
# Fit the reloaded estimator on the training split.
# NOTE(review): GridSearchCV refits best_estimator_ on the data passed to its
# fit() by default, so this step presumably reproduces the model that was
# loaded - TODO confirm whether the extra fit is needed.
best_model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split.
y_pred = best_model.predict(X=X_test)

# Error metrics on the test set; RMSE is derived from MSE rather than
# computed separately.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report in a fixed order: MAE, MSE, RMSE, R2.
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R2 Score: {r2}")

In [None]:
# Pair each training column with the fitted forest's importance score and
# rank them from most to least important.
feature_importance = (
    pd.Series(data=best_model.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)
print("Feature Importance:\n", feature_importance)

In [None]:
# Persist the evaluated model under its own file name, separate from the
# grid-search artifact.
joblib.dump(
    value=best_model,
    filename=r"../models/random_forest_regressor_tuned_v3.pkl",
)