In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the competition train/test tables.
# The dataset folder is named once (DATA_DIR) so the notebook can be pointed
# at another location by editing a single line instead of every path.
from pathlib import Path

DATA_DIR = Path(r"D:\kaggle datasets\playground s5ep10\playground-series-s5e10")

train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

In [None]:
# Preview the first ten rows of the test table.
test_df.head(n=10)

In [None]:
# Separate the target column from the predictors. The `id` column is dropped
# from both feature frames so train and test share the same feature columns.
x = train_df.drop(columns=['id', 'accident_risk'])
y = train_df['accident_risk']
x_test = test_df.drop(columns='id')

In [None]:
# Stack train features on top of test features so every encoding step below is
# applied to both identically. Row labels are kept as-is (no ignore_index), so
# the later split back into train/test has to be positional.
combined_df = pd.concat((x, x_test), axis='index')

In [None]:
# Boolean encoding: cast every bool column to integer 0/1.
bool_cols = combined_df.select_dtypes(include=['bool']).columns
combined_df = combined_df.astype(dict.fromkeys(bool_cols, int))

# Ordinal (ordered) encoding: replace each label with its rank.
# Series.map yields NaN for any label that is absent from the mapping.
lighting_mapping = dict(night=0, dim=1, daylight=2)
time_of_day_mapping = dict(morning=0, afternoon=1, evening=2)
for ordinal_col, ordinal_map in (
    ('lighting', lighting_mapping),
    ('time_of_day', time_of_day_mapping),
):
    combined_df[ordinal_col] = combined_df[ordinal_col].map(ordinal_map)

# Nominal (unordered) one-hot encoding; drop_first removes one level per column.
nominal_cols = ['road_type', 'weather']
combined_df = pd.get_dummies(combined_df, columns=nominal_cols, drop_first=True)

# Split back by position: the first len(x) rows are the training rows.
n_train_rows = len(x)
x = combined_df.iloc[:n_train_rows]
x_test = combined_df.iloc[n_train_rows:]


In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate hyper-parameter values sampled by the randomized search,
# one grid per model family.

# XGBoost search space
xgb_param_dist = dict(
    n_estimators=[300, 500, 700, 900],
    learning_rate=np.linspace(0.01, 0.1, 10),
    max_depth=[4, 5, 6, 7, 8],
    min_child_weight=[1, 3, 5, 7],
    subsample=np.linspace(0.6, 1.0, 5),
    colsample_bytree=np.linspace(0.6, 1.0, 5),
    reg_lambda=[0.5, 1, 1.5, 2],
)

# LightGBM search space (max_depth=-1 is included as a candidate)
lgb_param_dist = dict(
    n_estimators=[500, 700, 900, 1100],
    learning_rate=np.linspace(0.01, 0.1, 10),
    max_depth=[6, 8, 9, 10, -1],
    num_leaves=[31, 63, 127, 255],
    subsample=np.linspace(0.6, 1.0, 5),
    colsample_bytree=np.linspace(0.6, 1.0, 5),
    reg_lambda=[0.1, 0.5, 1.0, 1.5],
    reg_alpha=[0.0, 0.1, 0.5, 1.0],
)

# CatBoost search space
cat_param_dist = dict(
    n_estimators=[500, 700, 900, 1100],
    learning_rate=np.linspace(0.01, 0.1, 10),
    depth=[4, 6, 8, 10],
    l2_leaf_reg=[1, 3, 5, 7],
    bagging_temperature=np.linspace(0.0, 1.0, 5),
    border_count=[32, 64, 128, 254],
)

# Randomized search helper
def tune_model(model, param_dist, name, X=None, y=None, n_iter=30, cv=3, random_state=42):
    """Run a randomized hyper-parameter search and return the best setting.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``RandomizedSearchCV``.
    param_dist : dict
        Maps parameter names to the candidate values to sample from.
    name : str
        Label used only in the printed report.
    X, y : optional
        Training features / target. When omitted, the notebook-level
        ``x_train`` / ``y_train`` are used, which is what the original
        three-argument call did.
    n_iter : int, default 30
        Number of parameter settings sampled.
    cv : int, default 3
        Number of cross-validation folds.
    random_state : int, default 42
        Seed for the parameter sampling.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if X is None:
        X = x_train
    if y is None:
        y = y_train

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        verbose=1,
        random_state=random_state,
        n_jobs=-1,
    )
    search.fit(X, y)

    print(f"\n Best Parameters for {name}:")
    print(search.best_params_)
    # The scorer is negated RMSE, so flip the sign back for reporting.
    print(f"Best CV RMSE: {-search.best_score_:.4f}\n")
    return search.best_params_

# Run the randomized search once per model family and keep each winner.
best_xgb_params = tune_model(
    model=XGBRegressor(),
    param_dist=xgb_param_dist,
    name="XGBoost",
)
best_lgb_params = tune_model(
    model=LGBMRegressor(),
    param_dist=lgb_param_dist,
    name="LightGBM",
)
best_cat_params = tune_model(
    model=CatBoostRegressor(verbose=0),  # silence CatBoost's per-iteration log
    param_dist=cat_param_dist,
    name="CatBoost",
)

In [10]:
# Base learners for the stacked ensemble, each with fixed hyper-parameters.

# XGBoost model
xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=6,
    min_child_weight=5,
    subsample=0.7,
    gamma=0,
    reg_lambda=1.5,
    colsample_bytree=0.9,
)

# LightGBM model
# `gamma` is an XGBoost parameter, not a LightGBM one, so it is no longer
# passed here (LightGBM only reported it as an unknown parameter).
# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0;
# as configured the 0.721 value has no effect. Left unchanged so the fitted
# model stays the same -- confirm whether row subsampling was intended.
lgb_model = LGBMRegressor(
    n_estimators=889,
    learning_rate=0.016,
    max_depth=9,
    num_leaves=130,
    min_child_weight=1,
    subsample=0.721,
    reg_lambda=1.11,
    reg_alpha=0.46,
    colsample_bytree=0.99,
)

# CatBoost model
# verbose=0 matches the CatBoost instance used for tuning and keeps the
# per-iteration training log out of the notebook output.
cat_model = CatBoostRegressor(
    n_estimators=1011,
    learning_rate=0.053,
    depth=8,
    l2_leaf_reg=2.53,
    bagging_temperature=0.144,
    border_count=121,
    verbose=0,
)

# Train every base learner on the training split.
for base_model in (xgb_model, lgb_model, cat_model):
    base_model.fit(x_train, y_train)

# Predict the validation split with each fitted learner, in the same order.
xgb_pred, lgb_pred, cat_pred = (
    fitted.predict(x_val) for fitted in (xgb_model, lgb_model, cat_model)
)

from sklearn.model_selection import cross_val_predict

# Stack the base-model validation predictions column-wise: one row per
# validation sample, one column per base model (XGBoost, LightGBM, CatBoost).
stacked_features = np.column_stack((xgb_pred, lgb_pred, cat_pred))

# Train the meta-model (linear regression) on all stacked validation rows;
# this fitted instance is the one used for downstream inference.
meta_model = LinearRegression()
meta_model.fit(stacked_features, y_val)

# Score the ensemble with out-of-fold predictions. Predicting the rows the
# meta-model was just fit on would give an in-sample error, not a validation
# error, so each row is predicted by a meta-model that never saw it.
final_pred = cross_val_predict(LinearRegression(), stacked_features, y_val, cv=5)

# Evaluate
rmse_ensemble = np.sqrt(mean_squared_error(y_val, final_pred))
print(f"Stacked Ensemble Validation RMSE: {rmse_ensemble:.4f}")

In [None]:
# joblib is not imported anywhere else in the notebook, so import it here.
import joblib

# Persist the fitted meta-model and read it back.
# Only the meta-model is saved: model.pkl alone cannot score raw feature rows,
# because the three base models are also needed.
joblib.dump(meta_model, "model.pkl")
loaded_model = joblib.load("model.pkl")

# The meta-model was fit on three columns (XGBoost, LightGBM and CatBoost
# predictions), so the test rows must go through the base models first and be
# stacked in that same column order. Passing x_test directly would hand it the
# raw feature matrix instead.
test_stacked_features = np.column_stack((
    xgb_model.predict(x_test),
    lgb_model.predict(x_test),
    cat_model.predict(x_test),
))
pred = loaded_model.predict(test_stacked_features)
print(pred)

In [14]:
# Pair every test id with its predicted risk and write the submission CSV
# without the index column.
file_path = r"D:\kaggle datasets\playground s5ep10\playground-series-s5e10\my_submission5.csv"
submission_df = test_df[['id']].assign(accident_risk=pred)
submission_df.to_csv(file_path, index=False)