In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt 
import xgboost as xgb #use xgb as algorthm model
from xgboost import XGBRegressor #use xgb regressor because we want to predict continuous numerical value
from sklearn.preprocessing import LabelEncoder #encode all categorical and boolean feature
import optuna # hyperparameter tuning using optuna
from sklearn.model_selection import cross_val_score #to evaluate model
from sklearn.model_selection import StratifiedKFold #to perform k-fold

In [6]:
data_train = pd.read_csv('./train.csv')
data_test = pd.read_csv('./test.csv')

In [7]:
#divide dataset into 4
y_train = data_train['accident_risk'] #target train
train_ids = data_train['id'] #target train id
test_ids = data_test['id'] #target test id
train = data_train.drop(['accident_risk','id'],axis=1) #train feature 
test = data_test.drop(['id'],axis=1) # test feature

In [8]:
X_train = train.copy()
X_test = test.copy()

In [9]:
#feature encoding

#first select categorical,object and boolean column
cat_col = X_train.select_dtypes(['object','bool']).columns

In [10]:
#label encode

#in test data we use only transform, to prevent inconsistency, like example: in training blue encoded as 1,
#so when in test dataset the blue also encoded as 1 not 0 or other number.
for col in cat_col:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))


In [11]:
def objective(trial):

    #optuna will search the best output given a specified range of numbers
    xgb_params = {
        'objective': 'reg:squarederror',
        'max_bin': trial.suggest_int('max_bin',100,600),
        'learning_rate': trial.suggest_float('learning_rate',0.01,0.1),
        'max_depth': trial.suggest_int('max_depth',1,10),
        'min_child_weight': trial.suggest_int('min_child_weight',1,10),
        'subsample': trial.suggest_float('subsample',0.1,1),
        'colsample_bytree': trial.suggest_float('colsample_bytree',0.1,1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel',0.1,1),
        'colsample_bynode': trial.suggest_float('colsample_bynode',0.1,1),
        'reg_alpha': trial.suggest_float('reg_alpha',0.1,1),
        'reg_lambda': trial.suggest_float('reg_lambda',0.1,1),
        'gamma': trial.suggest_float('gamma',0.1,1),
        'max_delta_step': trial.suggest_int('max_delta_step',0,10),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight',0.1,1)
    }

    model = XGBRegressor(**xgb_params,tree_method='hist',device='cuda') #configure xgbregressor model

    #evaluate model
    score = cross_val_score(model,X_train,y_train,cv=7,scoring='neg_mean_absolute_error')#cross_val_score do .fit() internally
    mae = -score.mean()
    return mae


In [12]:
#configure optuna
study = optuna.create_study(study_name='xgboost_optuna_roadaccident_comp',direction='minimize')
study.optimize(objective,n_trials=100,show_progress_bar=True,n_jobs=-1) #njobs -1 to run on all cores to speed up process


[I 2025-11-26 15:59:09,882] A new study created in memory with name: xgboost_optuna_roadaccident_comp


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-11-26 15:59:37,084] Trial 4 finished with value: 0.058268797219213096 and parameters: {'max_bin': 213, 'learning_rate': 0.09347719881990574, 'max_depth': 1, 'min_child_weight': 9, 'subsample': 0.2597858561641704, 'colsample_bytree': 0.42738351614761105, 'colsample_bylevel': 0.7100252319310371, 'colsample_bynode': 0.7750520660361881, 'reg_alpha': 0.20525743484586445, 'reg_lambda': 0.2623039572248737, 'gamma': 0.2806549194213439, 'max_delta_step': 10, 'scale_pos_weight': 0.44274758840825346}. Best is trial 4 with value: 0.058268797219213096.
[I 2025-11-26 15:59:52,964] Trial 0 finished with value: 0.06654606655315733 and parameters: {'max_bin': 102, 'learning_rate': 0.07263713998269804, 'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.8095788193688069, 'colsample_bytree': 0.555356779843612, 'colsample_bylevel': 0.12491783476989698, 'colsample_bynode': 0.8084693944007352, 'reg_alpha': 0.3767640254501896, 'reg_lambda': 0.8355268951511426, 'gamma': 0.5901376836188226, 'max_delt

stratified k-fold

In [13]:
#use 7 fold
FOLD = 7

In [14]:
#stratification bins
#In practice, many data scientists use standard K-fold for regression first, 
# and only implement binned stratification if the results show high variance across folds 
# or if the target distribution is highly imbalanced.
#q=10 best practice, and must be q > k.(q  = 10 must be larger than k = 7 fold)
#this q will also have positive relationship with data,if data is large q also should be increased

y_bins = pd.qcut(y_train,q=10,labels=False,duplicates='drop')

In [15]:
skf = StratifiedKFold(n_splits=FOLD,shuffle=True,random_state=42)
fold_splits = list(skf.split(X_train,y_bins))

In [16]:
fold_splits

[(array([     0,      1,      2, ..., 517749, 517750, 517751]),
  array([     8,     17,     43, ..., 517728, 517752, 517753])),
 (array([     1,      2,      4, ..., 517751, 517752, 517753]),
  array([     0,      3,      7, ..., 517726, 517735, 517737])),
 (array([     0,      1,      3, ..., 517751, 517752, 517753]),
  array([     2,     32,     35, ..., 517732, 517745, 517750])),
 (array([     0,      1,      2, ..., 517751, 517752, 517753]),
  array([     9,     16,     21, ..., 517743, 517744, 517749])),
 (array([     0,      1,      2, ..., 517751, 517752, 517753]),
  array([     4,     12,     13, ..., 517741, 517742, 517746])),
 (array([     0,      1,      2, ..., 517751, 517752, 517753]),
  array([     5,     10,     11, ..., 517731, 517733, 517747])),
 (array([     0,      2,      3, ..., 517750, 517752, 517753]),
  array([     1,      6,     23, ..., 517740, 517748, 517751]))]

In [17]:
oof_prediction = np.zeros(len(X_train))
test_prediction = np.zeros(len(X_test))

fold_scores = []
# feature_importance_dict = {}

In [18]:
for fold,(train_idx,val_idx) in enumerate(fold_splits,1):
    print(f'Fold {fold}/{FOLD}')

    X_tr,X_val = X_train.iloc[train_idx],X_train.iloc[val_idx]
    y_tr,y_val = y_train.iloc[train_idx],y_train.iloc[val_idx]

    dtrain = xgb.DMatrix(X_tr,label=y_tr)
    dval = xgb.DMatrix(X_val,label=y_val)
    dtest = xgb.DMatrix(X_test)

    model = xgb.train(
        params = study.best_params,
        dtrain = dtrain,
        num_boost_round = 10000,
        evals = [(dval,'valid')],
        early_stopping_rounds = 200,
        verbose_eval = False
    )

    oof_prediction[val_idx] = model.predict(dval)
    test_prediction += model.predict(dtest)

    fold_rmse = np.sqrt(np.mean((oof_prediction[val_idx] - y_val)**2))
    fold_scores.append(fold_rmse)
    print(f'FOLD {fold} RMSE:{fold_rmse:.6f}')

test_prediction/=FOLD

Fold 1/7
FOLD 1 RMSE:0.056769
Fold 2/7
FOLD 2 RMSE:0.056292
Fold 3/7
FOLD 3 RMSE:0.056503
Fold 4/7
FOLD 4 RMSE:0.056268
Fold 5/7
FOLD 5 RMSE:0.056907
Fold 6/7
FOLD 6 RMSE:0.056689
Fold 7/7
FOLD 7 RMSE:0.056670


In [19]:
print(f'Average RMSE:{np.mean(fold_scores):.4f}')

Average RMSE:0.0566


In [20]:
print(test_prediction)

[0.2942352  0.13062514 0.19341974 ... 0.25150612 0.13364523 0.48470294]
