In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [7]:
np.random.seed(42) 
N = 5000 
df = pd.DataFrame({ 
    "speed": np.random.normal(45, 10, N), 
    "driver_fatigue": np.random.uniform(0, 10, N), 
    "traffic_density": np.random.uniform(0, 1, N), 
    "visibility": np.random.normal(600, 200, N), 
    "brake_events": np.random.poisson(3, N), 
    "hour": np.random.randint(5, 18, N), 
    "road_type": np.random.choice(["highway", "city", "rural"], N), 
    "weather": np.random.choice( ["clear", "rain", "fog"], N, p=[0.6, 0.25, 0.15] ) })

In [9]:
df

Unnamed: 0,speed,driver_fatigue,traffic_density,visibility,brake_events,hour,road_type,weather
0,49.967142,1.682366,0.312656,345.357445,3,8,city,clear
1,43.617357,1.899347,0.178233,336.440050,7,7,rural,clear
2,51.476885,4.607122,0.859182,157.146363,2,14,city,clear
3,60.230299,2.862799,0.403523,763.506902,7,11,highway,clear
4,42.658466,2.474630,0.398544,765.531502,0,14,city,rain
...,...,...,...,...,...,...,...,...
4995,44.510350,1.402850,0.636944,641.363272,2,8,rural,fog
4996,52.114106,8.865641,0.746460,464.116088,4,8,city,clear
4997,76.129102,7.297726,0.927614,658.116070,2,16,rural,fog
4998,53.080362,1.646703,0.646860,816.316146,3,9,highway,rain


In [11]:
df["speed"] = df["speed"].clip(lower=10) 
df["visibility"] = df["visibility"].clip(lower=50)

In [13]:
risk_score = ( 
    0.03 * df["speed"] + 
    0.2 * df["driver_fatigue"] + 
    0.5 * df["traffic_density"] + 
    0.4 * (df["weather"] == "fog").astype(int) + 
    0.3 * (df["road_type"] == "highway").astype(int) + 
    0.2 * df["brake_events"] 
)

prob = 1 / (1 + np.exp(-risk_score))

threshold = np.percentile(prob, 70) 
df["accident_risk"] = (prob > threshold).astype(int)

In [15]:
print("\nTarget distribution:") 
print(df["accident_risk"].value_counts())


Target distribution:
accident_risk
0    3500
1    1500
Name: count, dtype: int64


In [19]:
df.to_csv("school_bus_accident_data.csv",index = False)

In [21]:
X = df.drop("accident_risk", axis=1)

In [23]:
X

Unnamed: 0,speed,driver_fatigue,traffic_density,visibility,brake_events,hour,road_type,weather
0,49.967142,1.682366,0.312656,345.357445,3,8,city,clear
1,43.617357,1.899347,0.178233,336.440050,7,7,rural,clear
2,51.476885,4.607122,0.859182,157.146363,2,14,city,clear
3,60.230299,2.862799,0.403523,763.506902,7,11,highway,clear
4,42.658466,2.474630,0.398544,765.531502,0,14,city,rain
...,...,...,...,...,...,...,...,...
4995,44.510350,1.402850,0.636944,641.363272,2,8,rural,fog
4996,52.114106,8.865641,0.746460,464.116088,4,8,city,clear
4997,76.129102,7.297726,0.927614,658.116070,2,16,rural,fog
4998,53.080362,1.646703,0.646860,816.316146,3,9,highway,rain


In [25]:
y = df["accident_risk"]

In [27]:
y

0       0
1       0
2       0
3       1
4       0
       ..
4995    0
4996    1
4997    1
4998    0
4999    0
Name: accident_risk, Length: 5000, dtype: int64

In [55]:
X_train, X_test, y_train, y_test =  train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state=42,
    stratify=y
)

In [31]:
categorical_cols= ["road_type","weather"]
numeral_cols = X.select_dtypes(include=[np.number]).columns.tolist()

In [33]:
categorical_cols

['road_type', 'weather']

In [35]:
numeral_cols

['speed',
 'driver_fatigue',
 'traffic_density',
 'visibility',
 'brake_events',
 'hour']

In [51]:
preprocessor = ColumnTransformer(
    transformers =[
        ("num","passthrough",numeral_cols),
        ("cat", OneHotEncoder(drop="first",  # avoids dummy variable trap
         handle_unknown="ignore"),categorical_cols)
    ]
)

In [57]:
df

Unnamed: 0,speed,driver_fatigue,traffic_density,visibility,brake_events,hour,road_type,weather,accident_risk
0,49.967142,1.682366,0.312656,345.357445,3,8,city,clear,0
1,43.617357,1.899347,0.178233,336.440050,7,7,rural,clear,0
2,51.476885,4.607122,0.859182,157.146363,2,14,city,clear,0
3,60.230299,2.862799,0.403523,763.506902,7,11,highway,clear,1
4,42.658466,2.474630,0.398544,765.531502,0,14,city,rain,0
...,...,...,...,...,...,...,...,...,...
4995,44.510350,1.402850,0.636944,641.363272,2,8,rural,fog,0
4996,52.114106,8.865641,0.746460,464.116088,4,8,city,clear,1
4997,76.129102,7.297726,0.927614,658.116070,2,16,rural,fog,1
4998,53.080362,1.646703,0.646860,816.316146,3,9,highway,rain,0


In [59]:
models = {
    "Logistic Regression": LogisticRegression(
        max_iter = 1000, 
        solver ="lbfgs"),
    "Decision Tree": DecisionTreeClassifier(
        max_depth = 6,
        random_state=42),
    "Random Forests": RandomForestClassifier(
        n_estimators= 200,
        random_state=42,n_jobs=-1)
}

In [61]:
#Training

for name, model in models.items():
    pipeline = Pipeline(steps=[
        ("preprocessing",preprocessor),
        ("model", model)
    ])

In [63]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [65]:
print(f"Model:{name}")
print(f"Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Model:Random Forests
Accuracy: 0.956
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       875
           1       0.95      0.90      0.92       375

    accuracy                           0.96      1250
   macro avg       0.96      0.94      0.95      1250
weighted avg       0.96      0.96      0.96      1250

