In [1]:
import pandas as pd
from xgboost import XGBRegressor
import optuna 
from sklearn.metrics import mean_absolute_error , r2_score 
from sklearn.model_selection import train_test_split , cross_val_score
import joblib
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

In [2]:
# Load the flight dataset from a CSV in the current working directory.
# The column names shown by df.head() (scale__, OHE__, *_oe__ prefixes) suggest the
# features were already encoded/scaled upstream — presumably in a separate
# preprocessing step; confirm where this file is produced.
df = pd.read_csv('flight_transformed_final.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns and the target (`price`).
df.head()

Unnamed: 0,scale__days_left,stops_oe__stops,arrival_time_oe__arrival_time,class_oe__class,departure_time_oe__departure_time,OHE__airline_Air_India,OHE__airline_GO_FIRST,OHE__airline_Indigo,OHE__airline_SpiceJet,OHE__airline_Vistara,...,OHE__source_city_Hyderabad,OHE__source_city_Kolkata,OHE__source_city_Mumbai,OHE__destination_city_Chennai,OHE__destination_city_Delhi,OHE__destination_city_Hyderabad,OHE__destination_city_Kolkata,OHE__destination_city_Mumbai,remainder__route_avg_duration,price
0,-1.855569,0.0,4.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.367774,5953
1,-1.855569,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.367774,5953
2,-1.855569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.367774,5956
3,-1.855569,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.367774,5955
4,-1.855569,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.367774,5955


In [4]:
# Separate the regression target (`price`) from the feature columns.
y = df["price"]
X = df.drop(columns=["price"])

In [5]:
# Exclude the route-average-duration column from the feature set.
# NOTE(review): the notebook does not say why this feature is dropped — document the reason.
# This cell rebinds X, so re-running it without re-running the previous cell raises a KeyError.
X = X.drop(columns=["remainder__route_avg_duration"])


In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Sanity check: (rows, feature columns) of the training features.
X_train.shape

(237714, 20)

In [8]:
# Sanity check: the training target should have the same row count as X_train.
y_train.shape

(237714,)

In [9]:
def objective(trail):
    """Optuna objective: mean 5-fold cross-validated R^2 of an XGBRegressor.

    Parameters
    ----------
    trail : optuna trial object
        Used to sample `n_estimators`, `max_depth`, `colsample_bytree`
        and `learning_rate` for this evaluation. (The parameter name is
        spelled `trail` in this notebook.)

    Returns
    -------
    float
        Average R^2 over the five folds, computed on the notebook-level
        `X_train` / `y_train`. Higher is better.
    """
    hyperparams = dict(
        n_estimators=trail.suggest_int("n_estimators", 50, 200),
        max_depth=trail.suggest_int("max_depth", 3, 20),
        # fraction of features made available to each tree
        colsample_bytree=trail.suggest_float("colsample_bytree", 0.6, 1.0),
        learning_rate=trail.suggest_float("learning_rate", 0.01, 0.3),
    )

    regressor = XGBRegressor(**hyperparams)
    # cross_val_score fits a fresh copy of the regressor on every fold.
    fold_r2 = cross_val_score(regressor, X_train, y_train, cv=5, scoring="r2")
    return fold_r2.mean()


In [None]:
# Search for the hyperparameters that maximise the mean cross-validated R^2 returned
# by `objective`. The TPE sampler is seeded so that a Restart & Run All proposes the
# same sequence of trials; without a seed every run explores different parameters.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# 50 trials, each running 5-fold CV on the full training split — this is the slow cell.
study.optimize(objective, n_trials=50)

[32m[I 2026-01-29 23:37:36,858][0m A new study created in memory with name: no-name-39371c6d-9476-4fd1-b994-a4693e47b19f[0m
[32m[I 2026-01-29 23:37:41,232][0m Trial 0 finished with value: 0.9729943752288819 and parameters: {'n_estimators': 156, 'max_depth': 8, 'colsample_bytree': 0.8789682034852455, 'learning_rate': 0.05511215622112854}. Best is trial 0 with value: 0.9729943752288819.[0m
[32m[I 2026-01-29 23:37:42,513][0m Trial 1 finished with value: 0.9655214309692383 and parameters: {'n_estimators': 92, 'max_depth': 4, 'colsample_bytree': 0.9454729581657033, 'learning_rate': 0.267846256280183}. Best is trial 0 with value: 0.9729943752288819.[0m
[32m[I 2026-01-29 23:37:45,559][0m Trial 2 finished with value: 0.9757776379585266 and parameters: {'n_estimators': 85, 'max_depth': 10, 'colsample_bytree': 0.6909280137862954, 'learning_rate': 0.22423204326951401}. Best is trial 2 with value: 0.9757776379585266.[0m
[32m[I 2026-01-29 23:37:53,677][0m Trial 3 finished with value: 

In [None]:
# Report the winning trial. The study maximises the mean 5-fold R^2 returned by
# `objective` (scoring='r2'), so best_value is an R^2 score, not an RMSE.
print("Best Parameters:", study.best_params)
print("Best mean CV R2:", study.best_value)

In [None]:


# Final regressor trained on the whole training split.
# NOTE(review): these hyperparameters are hard-coded rather than read from
# study.best_params — presumably copied from a tuning run; confirm they still match.
# n_estimators sits at the upper bound (200) of the range searched in `objective`.
final_model = XGBRegressor(
    n_estimators=200,
    max_depth=11,
    colsample_bytree=0.747975912969109,
    learning_rate=0.08643568125213257,
)
final_model.fit(X_train, y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.747975912969109
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
# Predict prices for the held-out test split with the fitted final model.
f_pred = final_model.predict(X_test)

In [None]:
# Evaluate the final model on the held-out test split.
# MAE is computed once and reused for both the printout and `mae_error`.
mae_error = mean_absolute_error(y_test, f_pred)
print("R2 Score:", r2_score(y_test, f_pred))
print("MAE:", mae_error)

R2 Score: 0.9768717885017395
MAE: 1839.32177734375


In [None]:
# Importance scores reported by the fitted XGBoost model.
# Note: despite its name, `features` holds the scores, not the feature names;
# it is not used by any later cell visible in this notebook.
features = final_model.feature_importances_

In [None]:
import shap

In [None]:
# Build a SHAP tree explainer around the fitted final model
# (relies on the `import shap` in the preceding cell).
explainer = shap.TreeExplainer(final_model)


In [None]:
# Explain a fixed random subset of 1000 training rows instead of the full
# training split, to keep the SHAP computation quick; the seed makes the subset repeatable.
X_shap = X_train.sample(n=1000, random_state=42)
shap_values = explainer.shap_values(X_shap)


In [None]:
# Persist the fitted model to final_model.pkl in the current working directory;
# the list of written file names returned by joblib.dump is the cell's output.
# NOTE(review): a pickled estimator may not load under a different xgboost version —
# consider final_model.save_model(...) if the file must outlive this environment; confirm.
joblib.dump(final_model , "final_model.pkl")

['final_model.pkl']