In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e10/sample_submission.csv
/kaggle/input/playground-series-s5e10/train.csv
/kaggle/input/playground-series-s5e10/test.csv


In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet,
    HuberRegressor, SGDRegressor, BayesianRidge,
    TheilSenRegressor, RANSACRegressor, QuantileRegressor, PoissonRegressor
)
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error



In [3]:

df=pd.read_csv('/kaggle/input/playground-series-s5e10/train.csv')

In [4]:
df.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [5]:
df.describe()

Unnamed: 0,id,num_lanes,curvature,speed_limit,num_reported_accidents,accident_risk
count,517754.0,517754.0,517754.0,517754.0,517754.0,517754.0
mean,258876.5,2.491511,0.488719,46.112575,1.18797,0.352377
std,149462.849974,1.120434,0.272563,15.788521,0.895961,0.166417
min,0.0,1.0,0.0,25.0,0.0,0.0
25%,129438.25,1.0,0.26,35.0,1.0,0.23
50%,258876.5,2.0,0.51,45.0,1.0,0.34
75%,388314.75,3.0,0.71,60.0,2.0,0.46
max,517753.0,4.0,1.0,70.0,7.0,1.0


In [6]:
df.nunique()

id                        517754
road_type                      3
num_lanes                      4
curvature                    261
speed_limit                    5
lighting                       3
weather                        3
road_signs_present             2
public_road                    2
time_of_day                    3
holiday                        2
school_season                  2
num_reported_accidents         8
accident_risk                 98
dtype: int64

In [7]:
df.isnull().sum()

id                        0
road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
accident_risk             0
dtype: int64

In [8]:
df.drop(columns=['id'],inplace=True)

In [9]:
x=df.drop(columns=['accident_risk'])

In [10]:
y=df['accident_risk']

In [11]:
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)

In [12]:

num_cols=X_train.select_dtypes(include=['number']).columns.tolist()
cat_cols=X_train.select_dtypes(exclude=['number']).columns.tolist()

In [13]:
print("X_train columns:", list(X_train.columns))
print("num_cols:", num_cols, type(num_cols))
print("cat_cols:", cat_cols, type(cat_cols))

# Check membership
print("Missing in X_train (num):", [c for c in num_cols if c not in X_train.columns])
print("Missing in X_train (cat):", [c for c in cat_cols if c not in X_train.columns])


X_train columns: ['road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season', 'num_reported_accidents']
num_cols: ['num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents'] <class 'list'>
cat_cols: ['road_type', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season'] <class 'list'>
Missing in X_train (num): []
Missing in X_train (cat): []


In [14]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])


In [15]:
models = {
   "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Elastic Net": ElasticNet(),
    "Huber Regressor": HuberRegressor()
}

In [16]:
for name,model in models.items():
    
    pipeline=Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('regressor',model)
    ])
    pipeline.fit(X_train,y_train)
    y_pred=pipeline.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'name {name} rmse : {rmse} ,mae {mae} ')
        

name Linear Regression rmse : 0.005416952292735198 ,mae 0.058355102458901374 
name Ridge Regression rmse : 0.005416944496029028 ,mae 0.05835564054202117 
name Lasso Regression rmse : 0.027583939628372814 ,mae 0.1325875790196913 
name Elastic Net rmse : 0.027583939628372814 ,mae 0.1325875790196913 
name Huber Regressor rmse : 0.005418674439085877 ,mae 0.05834269220512417 


In [17]:
best_model = HuberRegressor()
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', best_model)
])
pipe.fit(x,y)

In [18]:
pipe.fit(x,y)
y_pred=pipe.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(rmse)

0.07360926633508137


In [19]:
test=pd.read_csv('/kaggle/input/playground-series-s5e10/test.csv')

In [20]:
test_pred=pipe.predict(test)
accident_risk_submission = pd.DataFrame({
    'id': test['id'],
    'accident_risk': test_pred
})
accident_risk_submission.to_csv('accident_risk_submission.csv', index=False)
accident_risk_submission.to_csv("accident_risk_submission.csv", index=False)
print("✅ accident_risk_submission.csv created successfully!")
accident_risk_submission.head()

✅ accident_risk_submission.csv created successfully!


Unnamed: 0,id,accident_risk
0,517754,0.366538
1,517755,0.159949
2,517756,0.208675
3,517757,0.434214
4,517758,0.326275
