In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
import warnings
# Suppress all Python warnings for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")

# **Preprocessing**

In [3]:
# Load the training data from train.csv (path is relative to the working directory).
df = pd.read_csv("train.csv")

In [4]:
# Preview the first rows of the loaded frame.
display(df.head())

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [5]:
# Count missing values in each column.
display(df.isnull().sum())

id                        0
road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
accident_risk             0
dtype: int64

In [6]:
# Keep only complete rows, then separate the target from the predictors.
df_cleaned = df.dropna()

target_col = 'accident_risk'
y = df_cleaned[target_col]
# 'id' is removed together with the target so it is not used as a feature.
X = df_cleaned.drop(columns=[target_col, 'id'])

In [7]:
# Partition feature names by dtype: object/bool columns are treated as
# categorical, int64/float64 columns as numeric.
categorical_cols = list(X.select_dtypes(include=['object', 'bool']).columns)
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

In [8]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each column.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)


In [9]:
# Learn the scaling statistics from the training rows only, so the mean and
# standard deviation of the held-out rows do not leak into preprocessing.
# train_test_split derives its shuffle from the row count and random_state, so
# reusing the test_size/random_state of the split below selects the same rows
# that end up in X_train. Keep the two calls in sync.
train_positions, test_positions = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_positions][numeric_cols])

# Apply the train-fitted transform to every row; the later split then yields
# consistently scaled train and test partitions.
X[numeric_cols] = scaler.transform(X[numeric_cols])

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# **Linear Regression**

In [11]:
# Linear regression baseline with default settings.
lr = LinearRegression()

In [12]:
# Fit the baseline on the training split.
lr.fit(X_train, y_train)

In [13]:
# Count missing values in each training feature.
display(X_train.isnull().sum())

num_lanes                  0
curvature                  0
speed_limit                0
num_reported_accidents     0
road_type_rural            0
road_type_urban            0
lighting_dim               0
lighting_night             0
weather_foggy              0
weather_rainy              0
road_signs_present_True    0
public_road_True           0
time_of_day_evening        0
time_of_day_morning        0
holiday_True               0
school_season_True         0
dtype: int64

In [14]:
# Predict the target for the held-out rows.
y_pred_LR = lr.predict(X_test)

In [15]:
# Find the index of every training row that has a NaN in any feature column
# (not just 'num_reported_accidents').
# NOTE: lr was fitted and used for prediction in earlier cells, so rows removed
# here do not affect that model; only the models fitted afterwards see the result.
nan_index = X_train[X_train.isnull().any(axis=1)].index

# Drop those rows from both X_train and y_train so features and target stay aligned
X_train = X_train.drop(nan_index)
y_train = y_train.drop(nan_index)

display(X_train.isnull().sum()) # Check again for NaN values

num_lanes                  0
curvature                  0
speed_limit                0
num_reported_accidents     0
road_type_rural            0
road_type_urban            0
lighting_dim               0
lighting_night             0
weather_foggy              0
weather_rainy              0
road_signs_present_True    0
public_road_True           0
time_of_day_evening        0
time_of_day_morning        0
holiday_True               0
school_season_True         0
dtype: int64

In [16]:
# Hold-out metrics for the linear baseline.
mse_lr = mean_squared_error(y_test, y_pred_LR)
rmse_lr = np.sqrt(mse_lr)
mae_lr = mean_absolute_error(y_test, y_pred_LR)
r2_lr = r2_score(y_test, y_pred_LR)
accuracy_lr = r2_lr * 100  # R² expressed as a percentage

print("RMSE:", rmse_lr)
print("MAE:", mae_lr)
print("R² Score:", r2_lr)
print("Accuracy (based on R²):", accuracy_lr, "%")

RMSE: 0.07353086258983135
MAE: 0.05831216757325724
R² Score: 0.8041884067087808
Accuracy (based on R²): 80.41884067087808 %


# **Polynomial Linear Regression**

In [17]:
# Expand the features to degree-2 polynomial terms. The expansion is fitted on
# the training split and then applied to both splits.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

In [18]:
# Linear regression on the polynomial-expanded training features.
poly_lr = LinearRegression()
poly_lr.fit(X_train_poly, y_train)

In [19]:
# Predict on the polynomial-expanded held-out features.
y_pred_poly = poly_lr.predict(X_test_poly)


In [20]:
# Hold-out metrics for the polynomial model.
mse_poly = mean_squared_error(y_test, y_pred_poly)
rmse_poly = np.sqrt(mse_poly)
mae_poly = mean_absolute_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)
accuracy_poly = r2_poly * 100  # R² expressed as a percentage

print("Polynomial Regression Performance:")
print("RMSE:", rmse_poly)
print("MAE:", mae_poly)
print("R² Score:", r2_poly)
print("Accuracy (based on R²):", accuracy_poly, "%")

Polynomial Regression Performance:
RMSE: 0.06782803283052195
MAE: 0.05302896149922269
R² Score: 0.8333836857157012
Accuracy (based on R²): 83.33836857157011 %


# **Decision Tree**

In [21]:
# Depth-limited regression tree; the seed is fixed so runs are repeatable.
dt = DecisionTreeRegressor(
    max_depth=15,
    random_state=42,
)
dt.fit(X_train, y_train)

In [22]:
# Predict the target for the held-out rows with the decision tree.
y_pred_DT = dt.predict(X_test)

In [23]:
# Hold-out metrics for the decision tree.
mse_dt = mean_squared_error(y_test, y_pred_DT)
rmse_dt = np.sqrt(mse_dt)
mae_dt = mean_absolute_error(y_test, y_pred_DT)
r2_dt = r2_score(y_test, y_pred_DT)
accuracy_dt = 100 * r2_dt  # R² expressed as a percentage

print("Decision Tree Results:")
print("RMSE:", rmse_dt)
print("MAE:", mae_dt)
print("R² Score:", r2_dt)
print("Accuracy (based on R²):", accuracy_dt, "%")

Decision Tree Results:
RMSE: 0.060835961005594213
MAE: 0.046707852132296473
R² Score: 0.8659645040876682
Accuracy (based on R²): 86.59645040876683 %


# **KNN**

In [24]:
# k-nearest-neighbours regressor using 5 neighbours.
knn = KNeighborsRegressor(
    n_neighbors=5,
)
knn.fit(X_train, y_train)


In [25]:
# Predict the target for the held-out rows with KNN.
y_pred_KNN = knn.predict(X_test)

In [26]:
# Hold-out metrics for KNN.
mse_knn = mean_squared_error(y_test, y_pred_KNN)
rmse_knn = np.sqrt(mse_knn)
mae_knn = mean_absolute_error(y_test, y_pred_KNN)
r2_knn = r2_score(y_test, y_pred_KNN)
accuracy_knn = 100 * r2_knn  # R² expressed as a percentage

print("KNN Results:")
print("RMSE:", rmse_knn)
print("MAE:", mae_knn)
print("R² Score:", r2_knn)
print("Accuracy (based on R²):", accuracy_knn, "%")

KNN Results:
RMSE: 0.06631788144759125
MAE: 0.05159722262460043
R² Score: 0.8407203224212387
Accuracy (based on R²): 84.07203224212387 %


# **RandomForest**

In [27]:

# Random forest of 40 depth-limited trees; the seed is fixed so runs are repeatable.
rf = RandomForestRegressor(
    n_estimators=40,
    max_depth=15,
    random_state=42,
)
rf.fit(X_train, y_train)



In [28]:
# Predict the target for the held-out rows with the random forest.
y_pred_RF  = rf.predict(X_test)

In [29]:
# Hold-out metrics for the random forest.
mse_rf = mean_squared_error(y_test, y_pred_RF)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_RF)
r2_rf = r2_score(y_test, y_pred_RF)
accuracy_rf = r2_rf * 100  # R² expressed as a percentage

print("RMSE:", rmse_rf)
print("MAE:", mae_rf)
print("R² Score:", r2_rf)
print("Accuracy (based on R²):", accuracy_rf, "%")

RMSE: 0.057056860201704626
MAE: 0.044220504651292646
R² Score: 0.8820997250385653
Accuracy (based on R²): 88.20997250385652 %


# **Gradient Boosting (XGBoost)**

In [30]:
from xgboost import XGBRegressor

# Hyperparameters collected in one place: a modest tree count keeps training
# fast, and the depth cap limits overfitting as well as training time.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "learning_rate": 0.1,
    "random_state": 42,
    "n_jobs": -1,
}

xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

In [31]:
# Predict the target for the held-out rows with the XGBoost model.
y_pred_GB = xgb.predict(X_test)

In [32]:
# Hold-out metrics for the XGBoost model.
mse_gb = mean_squared_error(y_test, y_pred_GB)
rmse_gb = np.sqrt(mse_gb)
mae_gb = mean_absolute_error(y_test, y_pred_GB)
r2_gb = r2_score(y_test, y_pred_GB)
accuracy_gb = r2_gb * 100  # R² expressed as a percentage

print("RMSE:", rmse_gb)
print("MAE:", mae_gb)
print("R² Score:", r2_gb)
print("Accuracy (based on R²):", accuracy_gb, "%")

RMSE: 0.056362738447171296
MAE: 0.04376499989518766
R² Score: 0.8849508932600546
Accuracy (based on R²): 88.49508932600546 %


In [33]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


# **CatBoost**

In [34]:
from catboost import CatBoostRegressor

# CatBoost settings: 100 boosting iterations, depth-6 trees, fixed seed, and
# verbose=0 to silence the training log.
cat_params = {
    "iterations": 100,
    "depth": 6,
    "learning_rate": 0.1,
    "random_seed": 42,
    "verbose": 0,
}

cat = CatBoostRegressor(**cat_params)
cat.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x15461b8f0a0>

In [35]:

# Predict the target for the held-out rows with CatBoost.
y_pred_cat = cat.predict(X_test)

In [36]:
# Hold-out metrics for CatBoost.
mse_cat = mean_squared_error(y_test, y_pred_cat)
rmse_cat = np.sqrt(mse_cat)
mae_cat = mean_absolute_error(y_test, y_pred_cat)
r2_cat = r2_score(y_test, y_pred_cat)
accuracy_cat = r2_cat * 100  # R² expressed as a percentage

print("RMSE:", rmse_cat)
print("MAE:", mae_cat)
print("R² Score:", r2_cat)
print("Accuracy (based on R²):", accuracy_cat, "%")

RMSE: 0.05666542388133035
MAE: 0.04409528681293341
R² Score: 0.8837118762497134
Accuracy (based on R²): 88.37118762497134 %


# **LGBM**

In [37]:
from lightgbm import LGBMRegressor

# LightGBM settings mirror the other boosted models: 100 trees, depth cap of 6,
# learning rate 0.1, fixed seed, all CPU cores.
lgbm_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "learning_rate": 0.1,
    "random_state": 42,
    "n_jobs": -1,
}

lgbm = LGBMRegressor(**lgbm_params)
lgbm.fit(X_train, y_train)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 197
[LightGBM] [Info] Number of data points in the train set: 414203, number of used features: 16
[LightGBM] [Info] Start training from score 0.352605


In [38]:
# Predict the target for the held-out rows with LightGBM.
y_pred_lgbm = lgbm.predict(X_test)

In [39]:
# Hold-out metrics for LightGBM. The percentage is computed for the comparison
# table but, unlike in the other sections, is not printed here.
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
rmse_lgbm = np.sqrt(mse_lgbm)
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
r2_lgbm = r2_score(y_test, y_pred_lgbm)
accuracy_lgbm = 100 * r2_lgbm

print("RMSE:", rmse_lgbm)
print("MAE:", mae_lgbm)
print("R² Score:", r2_lgbm)

RMSE: 0.05649561755587781
MAE: 0.04393208634382049
R² Score: 0.8844077811258976


# **Comparison**

In [40]:
# Collect the hold-out metrics of every model into one table. Each list is in
# the same order as 'Algorithm', so position i of every list describes model i.
data = {
    'Algorithm': [
        'Linear Regression',
        'Polynomail Regression',
        'Decision Tree',
        'KNN',
        'Random Forest',
        'Gradient Boosting',
        'CatBoost',
        'LGBM'
    ],
    'RMSE': [
        rmse_lr,
        rmse_poly,
        rmse_dt,
        rmse_knn,
        rmse_rf,
        rmse_gb,
        rmse_cat,
        rmse_lgbm
    ],
    'MAE': [
        mae_lr,
        mae_poly,
        mae_dt,
        mae_knn,
        mae_rf,
        mae_gb,
        mae_cat,
        mae_lgbm
    ],
    'R²': [
        r2_lr,
        r2_poly,
        r2_dt,
        r2_knn,
        r2_rf,
        r2_gb,
        r2_cat,
        r2_lgbm
    ],
    'Accuracy (%)': [
        accuracy_lr,
        accuracy_poly,
        accuracy_dt,
        accuracy_knn,
        accuracy_rf,
        accuracy_gb,
        accuracy_cat,
        accuracy_lgbm
    ]
}

comparison_df = pd.DataFrame(data)

# Use a dedicated name for the metric columns: rebinding `numeric_cols` here
# would overwrite the feature-column list that the scaling cell relies on and
# break that cell if it is re-run afterwards.
metric_cols = ['RMSE', 'MAE', 'R²', 'Accuracy (%)']
comparison_df[metric_cols] = comparison_df[metric_cols].round(3)

display(comparison_df)


Unnamed: 0,Algorithm,RMSE,MAE,R²,Accuracy (%)
0,Linear Regression,0.074,0.058,0.804,80.419
1,Polynomail Regression,0.068,0.053,0.833,83.338
2,Decision Tree,0.061,0.047,0.866,86.596
3,KNN,0.066,0.052,0.841,84.072
4,Random Forest,0.057,0.044,0.882,88.21
5,Gradient Boosting,0.056,0.044,0.885,88.495
6,CatBoost,0.057,0.044,0.884,88.371
7,LGBM,0.056,0.044,0.884,88.441


# **Pickled Model Pipeline**

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostRegressor
import pickle
import pandas as pd

In [42]:
# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Final pipeline: preprocessing + model
model_pipe = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', CatBoostRegressor(
        iterations=100,
        depth=6,
        learning_rate=0.1,
        random_seed=42,
        verbose=0
    ))
])


In [43]:
model_pipe.fit(X_train, y_train)


In [44]:
# Serialize the fitted pipeline to model.pkl in the working directory.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(model_pipe, model_file)
