In [4]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from hyperopt import fmin, tpe, hp, Trials
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('/kaggle/input/playground-series-s5e10/train.csv', index_col='id')
df.head()

Unnamed: 0_level_0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [47]:
df.describe()

Unnamed: 0,num_lanes,curvature,speed_limit,num_reported_accidents,accident_risk
count,517754.0,517754.0,517754.0,517754.0,517754.0
mean,2.491511,0.488719,46.112575,1.18797,0.352377
std,1.120434,0.272563,15.788521,0.895961,0.166417
min,1.0,0.0,25.0,0.0,0.0
25%,1.0,0.26,35.0,1.0,0.23
50%,2.0,0.51,45.0,1.0,0.34
75%,3.0,0.71,60.0,2.0,0.46
max,4.0,1.0,70.0,7.0,1.0


In [7]:
columns = ['road_type', 'num_lanes', 'speed_limit', 'weather', 
           'road_signs_present', 'time_of_day', 'curvature']
unused = df.columns.difference(columns)
unused

Index(['accident_risk', 'holiday', 'lighting', 'num_reported_accidents',
       'public_road', 'school_season'],
      dtype='object')

In [28]:
df['road_signs_present'].dtype

dtype('bool')

In [48]:
df.duplicated().sum() / len(df)

0.0012670109743237136

In [29]:
df = df.drop_duplicates(keep='first')

In [30]:
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['number']).columns.difference(['accident_risk'])
bool_cols = df.select_dtypes(include=['bool']).columns
target = 'accident_risk'

In [39]:
X = df.drop([target], axis=1)
y = df[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
def to_int(x):
    return x.astype(int)

bool_transformer = FunctionTransformer(to_int)

ct = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(), cat_cols),
    ('bool', bool_transformer, bool_cols)
])

In [27]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 413678 entries, 169475 to 121991
Data columns (total 12 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   road_type               413678 non-null  object 
 1   num_lanes               413678 non-null  int64  
 2   curvature               413678 non-null  float64
 3   speed_limit             413678 non-null  int64  
 4   lighting                413678 non-null  object 
 5   weather                 413678 non-null  object 
 6   road_signs_present      413678 non-null  bool   
 7   public_road             413678 non-null  bool   
 8   time_of_day             413678 non-null  object 
 9   holiday                 413678 non-null  bool   
 10  school_season           413678 non-null  bool   
 11  num_reported_accidents  413678 non-null  int64  
dtypes: bool(4), float64(1), int64(3), object(4)
memory usage: 30.0+ MB


In [40]:
X_train = ct.fit_transform(X_train)
X_val = ct.transform(X_val)

In [41]:
import joblib
joblib.dump(ct, 'preprocessor.pkl')

['preprocessor.pkl']

In [34]:
len(X_train[0])

20

In [42]:
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train,
         eval_set=[(X_val, y_val)],
         early_stopping_rounds=50, 
         verbose=False)

In [43]:
model.save_model('xgb_model.json')

In [73]:
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f'Mean Squared Error on Test Set: {mse:.4f}')
print(f'Root Mean Squared Error (RMSE) on Test Set: {rmse:.4f}')

Mean Squared Error on Test Set: 0.0031
Root Mean Squared Error (RMSE) on Test Set: 0.0559


In [75]:
def objective(params):
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth'])

    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        **params
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=False
    )
    
    predictions = model.predict(X_val)
    rmse = mean_squared_error(y_val, predictions)
    
    return rmse

In [76]:
# Define the hyperparameter search space
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 2000, 100),
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, 0), 
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'gamma': hp.uniform('gamma', 0, 0.5),
    'reg_alpha': hp.loguniform('reg_alpha', -10, 0),
    'reg_lambda': hp.loguniform('reg_lambda', -10, 0)
}

In [80]:
trials = Trials()

best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

print("Best hyperparameters found:")
print(best_params)

100%|██████████| 100/100 [32:25<00:00, 19.45s/trial, best loss: 0.003124803233778234]
Best hyperparameters found:
{'colsample_bytree': 0.922363755871093, 'gamma': 0.009008383559465531, 'learning_rate': 0.0907049622492688, 'max_depth': 8.0, 'n_estimators': 1500.0, 'reg_alpha': 9.116323094032606e-05, 'reg_lambda': 0.19992042136473398, 'subsample': 0.8285500447152042}


In [81]:
best_xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    learning_rate=best_params['learning_rate'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    gamma=best_params['gamma'],
    reg_alpha=best_params['reg_alpha'],
    reg_lambda=best_params['reg_lambda']
)

best_xgb_model.fit(X_train, y_train,
                   eval_set=[(X_val, y_val)],
                   early_stopping_rounds=50,
                   verbose=False)

# Evaluate the final model
test_predictions = best_xgb_model.predict(X_test)
rmse = mean_squared_error(y_test, test_predictions, squared=False)

print(f"Final RMSE on the test set with optimized parameters: {rmse:.4f}")

Final RMSE on the test set with optimized parameters: 0.0559


In [82]:
test_df = pd.read_csv('/kaggle/input/playground-series-s5e10/test.csv')
cat_cols = test_df.select_dtypes(include=['object']).columns
num_cols = test_df.select_dtypes(include=['number']).columns
bool_cols = test_df.select_dtypes(include=['bool']).columns

In [84]:
X_test = ct.transform(test_df)

In [85]:
pred = best_xgb_model.predict(X_test)

In [86]:
submission_df = pd.DataFrame({'id': test_df['id'], 'accident_risk': pred})
submission_df.to_csv('submission_xgb1.csv', index=False)

In [87]:
import pickle

with open("model_xgb.pkl", "wb") as f:
    pickle.dump(model, f)


In [88]:
df['curvature'].describe()

count    517098.000000
mean          0.488749
std           0.272570
min           0.000000
25%           0.260000
50%           0.510000
75%           0.710000
max           1.000000
Name: curvature, dtype: float64