In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e10/sample_submission.csv
/kaggle/input/playground-series-s5e10/train.csv
/kaggle/input/playground-series-s5e10/test.csv


In [26]:
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [27]:
# Read the competition training set and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/playground-series-s5e10/train.csv",
)
df.head(n=5)

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [28]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of each feature from the generated indicator columns.
categorical_cols = ['weather', 'time_of_day', 'lighting', 'road_type']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head(n=5)

Unnamed: 0,id,num_lanes,curvature,speed_limit,road_signs_present,public_road,holiday,school_season,num_reported_accidents,accident_risk,weather_foggy,weather_rainy,time_of_day_evening,time_of_day_morning,lighting_dim,lighting_night,road_type_rural,road_type_urban
0,0,2,0.06,35,False,True,False,True,1,0.13,False,True,False,False,False,False,False,True
1,1,4,0.99,35,True,False,True,True,0,0.35,False,False,True,False,False,False,False,True
2,2,4,0.63,70,False,True,True,False,2,0.3,False,False,False,True,True,False,True,False
3,3,4,0.07,35,True,True,False,False,1,0.21,False,True,False,True,True,False,False,False
4,4,1,0.58,60,False,False,True,False,1,0.56,True,False,True,False,False,False,True,False


In [29]:
# Convert every boolean column (original flags and generated dummies) to 0/1 integers.
bool_cols = df.select_dtypes(include='bool').columns
df = df.astype({flag_col: int for flag_col in bool_cols})
df.head(n=5)

Unnamed: 0,id,num_lanes,curvature,speed_limit,road_signs_present,public_road,holiday,school_season,num_reported_accidents,accident_risk,weather_foggy,weather_rainy,time_of_day_evening,time_of_day_morning,lighting_dim,lighting_night,road_type_rural,road_type_urban
0,0,2,0.06,35,0,1,0,1,1,0.13,0,1,0,0,0,0,0,1
1,1,4,0.99,35,1,0,1,1,0,0.35,0,0,1,0,0,0,0,1
2,2,4,0.63,70,0,1,1,0,2,0.3,0,0,0,1,1,0,1,0
3,3,4,0.07,35,1,1,0,0,1,0.21,0,1,0,1,1,0,0,0
4,4,1,0.58,60,0,0,1,0,1,0.56,1,0,1,0,0,0,1,0


In [30]:
# Candidate engineered features (currently disabled; kept for reference):
#df['speed_accident_rate'] = df['speed_limit'] / (df['num_reported_accidents'] + 1)
#df['lane_curvature'] = df['num_lanes'] * df['curvature']

In [31]:
# The target is `accident_risk`; every other column except `id` is used as a feature.
target_col = 'accident_risk'
y = df[target_col]
X = df.drop(columns=[target_col, 'id'])

In [32]:
# Hold out 25% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [33]:
# Preview the training features produced by the split.
X_train.head(n=5)

Unnamed: 0,num_lanes,curvature,speed_limit,road_signs_present,public_road,holiday,school_season,num_reported_accidents,weather_foggy,weather_rainy,time_of_day_evening,time_of_day_morning,lighting_dim,lighting_night,road_type_rural,road_type_urban
480272,4,0.52,45,0,0,1,0,0,1,0,1,0,0,0,0,0
147591,1,0.15,35,0,1,0,0,3,1,0,0,1,1,0,1,0
283894,2,0.56,45,0,1,1,1,0,0,1,1,0,1,0,1,0
25819,3,0.11,60,0,0,1,0,1,0,0,1,0,0,0,1,0
486351,4,0.24,60,1,1,1,1,2,1,0,0,1,1,0,1,0


In [34]:
# Base regressor used by the hyperparameter search and the final fit.
# `random_state` is the documented scikit-learn-API seed parameter, used here
# rather than the native `seed` keyword.
# NOTE: this assignment rebinds `xgb`, hiding the `xgboost` module alias that
# was imported earlier under the same name.
xgb = XGBRegressor(random_state=42)

In [35]:
import optuna


def objective(trial):
    """Return the validation RMSE of an XGBoost model for one Optuna trial.

    Samples one hyperparameter set from ``trial``, fits a fresh
    ``XGBRegressor`` on ``X_train``/``y_train`` and scores it on
    ``X_val``/``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    params = {
        'booster': 'gbtree',
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.3),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 2),
    }
    # Build a new estimator per trial instead of calling set_params on the
    # shared notebook-level model, so trials cannot leak state into each other
    # or into the model used for the final fit.
    model = XGBRegressor(random_state=42, **params)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))


# Seed the sampler so that re-running the notebook proposes the same trials.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)

[I 2025-10-30 17:02:05,571] A new study created in memory with name: no-name-41333af6-1a0c-40d6-a735-fbb2107c6f89
[I 2025-10-30 17:02:11,016] Trial 0 finished with value: 0.06908393527107495 and parameters: {'n_estimators': 150, 'max_depth': 9, 'learning_rate': 0.011301749530280025, 'subsample': 0.6085810221899469, 'colsample_bytree': 0.8269832983387759, 'gamma': 0.13372258771347628, 'min_child_weight': 5, 'reg_alpha': 0.6028548911299534, 'reg_lambda': 1.3505475323495468}. Best is trial 0 with value: 0.06908393527107495.
[I 2025-10-30 17:02:18,862] Trial 1 finished with value: 0.14386009890118756 and parameters: {'n_estimators': 242, 'max_depth': 9, 'learning_rate': 0.0010948517525255247, 'subsample': 0.6418028215187318, 'colsample_bytree': 0.6325940040662698, 'gamma': 0.18980247533096353, 'min_child_weight': 6, 'reg_alpha': 0.168378098182146, 'reg_lambda': 1.3296521885858812}. Best is trial 0 with value: 0.06908393527107495.
[I 2025-10-30 17:02:24,018] Trial 2 finished with value: 0.1

Best RMSE: 0.056210611899265045
Best params: {'n_estimators': 197, 'max_depth': 9, 'learning_rate': 0.06420807974292006, 'subsample': 0.9592087481779927, 'colsample_bytree': 0.9124772156195039, 'gamma': 0.010206585913435005, 'min_child_weight': 1, 'reg_alpha': 0.07716363628099902, 'reg_lambda': 1.261018485648577}


In [23]:
# Scratch log of validation RMSE values from different experiments (lower is better).
# NOTE(review): only the last value matches a run shown in this notebook (the
# "Best RMSE" printed by the Optuna study with drop_first=True). The other two
# presumably come from earlier runs -- confirm before relying on them. The gaps
# are ~1e-5 and the study sampler in those runs was not seeded, so they may be
# run-to-run noise.
0.05625724647123818 # "additional things" -- presumably the extra engineered features; confirm
0.05621448815220292 # get_dummies with drop_first = False
0.056210611899265045 # get_dummies with drop_first = True

0.05625724647123818

In [36]:
# Read the competition test set; its `id` column is needed later for the submission.
X_test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/playground-series-s5e10/test.csv",
)

In [37]:
# Apply the same encoding steps used for the training frame to the test set:
# one-hot encode the categorical columns, then turn every boolean column into 0/1.
X_test_transform = pd.get_dummies(
    X_test,
    columns=['weather', 'time_of_day', 'lighting', 'road_type'],
    drop_first=True,
)
bool_cols = X_test_transform.select_dtypes(include='bool').columns
X_test_transform = X_test_transform.astype(dict.fromkeys(bool_cols, int))
X_test_transform.head(n=5)

Unnamed: 0,id,num_lanes,curvature,speed_limit,road_signs_present,public_road,holiday,school_season,num_reported_accidents,weather_foggy,weather_rainy,time_of_day_evening,time_of_day_morning,lighting_dim,lighting_night,road_type_rural,road_type_urban
0,517754,2,0.34,45,1,1,1,1,1,0,0,0,0,0,1,0,0
1,517755,3,0.04,45,1,0,1,0,0,1,0,0,0,1,0,0,1
2,517756,2,0.59,35,1,0,1,1,1,0,0,0,0,1,0,0,1
3,517757,4,0.95,35,0,0,0,0,2,0,1,0,0,0,0,1,0
4,517758,2,0.86,35,1,0,0,1,3,0,0,1,0,0,0,0,0


In [38]:
# Drop the identifier and align the test features with the training matrix
# (same columns, same order). Train and test are one-hot encoded separately, so
# their dummy columns can differ when a category level is missing from one
# file: columns absent from the test frame are added as all-zero, and columns
# the model never saw are discarded.
# errors='ignore' keeps this cell re-runnable once `id` has been removed.
X_test_transform = (
    X_test_transform
    .drop(columns=['id'], errors='ignore')
    .reindex(columns=X_train.columns, fill_value=0)
)

In [39]:
# Load the best hyperparameters found by the study into the shared estimator,
# then refit it on the training split.
xgb.set_params(**study.best_params)
xgb.fit(X_train, y_train)

In [40]:
# Predict accident risk for the prepared test features with the fitted model.
y_final = xgb.predict(X_test_transform)

In [43]:
# Pair each test `id` with its predicted accident risk in the submission layout.
submission = pd.DataFrame(
    {"id": X_test["id"], "accident_risk": y_final}
)
submission.head(n=5)

Unnamed: 0,id,accident_risk
0,517754,0.293638
1,517755,0.120979
2,517756,0.182763
3,517757,0.317979
4,517758,0.412069


In [44]:
# Write the submission file to the working directory without the DataFrame index.
submission.to_csv(path_or_buf="submission.csv", index=False)