In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
# Location of the traffic-accident CSV; it is fetched over HTTP each time this cell runs.
DATA_URL = "https://raw.githubusercontent.com/santhoshkumaroff/MachineLearningDatasets/refs/heads/main/dataset_traffic_accident_prediction1.csv"
df = pd.read_csv(DATA_URL)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Weather,Road_Type,Time_of_Day,Traffic_Density,Speed_Limit,Number_of_Vehicles,Driver_Alcohol,Accident_Severity,Road_Condition,Vehicle_Type,Driver_Age,Driver_Experience,Road_Light_Condition,Accident
0,Rainy,City Road,Morning,1.0,100.0,5.0,0.0,,Wet,Car,51.0,48.0,Artificial Light,0.0
1,Clear,Rural Road,Night,,120.0,3.0,0.0,Moderate,Wet,Truck,49.0,43.0,Artificial Light,0.0
2,Rainy,Highway,Evening,1.0,60.0,4.0,0.0,Low,Icy,Car,54.0,52.0,Artificial Light,0.0
3,Clear,City Road,Afternoon,2.0,60.0,3.0,0.0,Low,Under Construction,Bus,34.0,31.0,Daylight,0.0
4,Rainy,Highway,Morning,1.0,195.0,11.0,0.0,Low,Dry,Car,62.0,55.0,Artificial Light,1.0


In [3]:
# Column dtypes and non-null counts for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Weather               798 non-null    object 
 1   Road_Type             798 non-null    object 
 2   Time_of_Day           798 non-null    object 
 3   Traffic_Density       798 non-null    float64
 4   Speed_Limit           798 non-null    float64
 5   Number_of_Vehicles    798 non-null    float64
 6   Driver_Alcohol        798 non-null    float64
 7   Accident_Severity     798 non-null    object 
 8   Road_Condition        798 non-null    object 
 9   Vehicle_Type          798 non-null    object 
 10  Driver_Age            798 non-null    float64
 11  Driver_Experience     798 non-null    float64
 12  Road_Light_Condition  798 non-null    object 
 13  Accident              798 non-null    float64
dtypes: float64(7), object(7)
memory usage: 92.0+ KB


In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Weather                 42
Road_Type               42
Time_of_Day             42
Traffic_Density         42
Speed_Limit             42
Number_of_Vehicles      42
Driver_Alcohol          42
Accident_Severity       42
Road_Condition          42
Vehicle_Type            42
Driver_Age              42
Driver_Experience       42
Road_Light_Condition    42
Accident                42
dtype: int64

# The target variable contains null values, so the rows where it is missing must be removed.

In [5]:
# Keep only rows that have a label; feature gaps are handled later by the imputers.
df = df.dropna(subset=["Accident"])

In [6]:
# Re-count missing values per column now that unlabeled rows are gone.
df.isna().sum()

Weather                 40
Road_Type               40
Time_of_Day             38
Traffic_Density         40
Speed_Limit             41
Number_of_Vehicles      38
Driver_Alcohol          40
Accident_Severity       42
Road_Condition          41
Vehicle_Type            39
Driver_Age              40
Driver_Experience       41
Road_Light_Condition    40
Accident                 0
dtype: int64

In [7]:
# Feature matrix: every column except the label.
# `columns=` already names the axis, so the redundant `axis=1` is dropped, and the
# label is passed as a list rather than a parenthesised string that looks like a tuple.
x = df.drop(columns=["Accident"])

# Label vector.
y = df["Accident"]

# 80/20 hold-out split; the fixed seed makes the partition repeatable across runs.
# NOTE(review): the split is not stratified — consider stratify=y if the classes
# turn out to be imbalanced.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, train_size=0.8, random_state=42
)

In [8]:
# Object-typed columns are treated as categorical and one-hot encoded downstream.
cat_columns = x.select_dtypes(include="object").columns

# "number" matches every numeric dtype rather than only int64/float64, so a numeric
# column stored with another width cannot silently fall through to the
# ColumnTransformer's passthrough remainder without imputation or scaling.
num_columns = x.select_dtypes(include="number").columns

In [9]:
# Numeric features: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("num_impute", SimpleImputer(strategy="mean")),
        ("standscale", StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent level, then one-hot encode.
# handle_unknown="ignore" encodes a level that was not seen during fit as all zeros
# instead of raising, which matters when a rare level lands only in the test split
# or in a cross-validation fold's validation part.
cat_pipeline = Pipeline(
    steps=[
        ("cat_impute", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)


In [10]:
# Route numeric and categorical columns through their own sub-pipelines;
# any column that is in neither list is passed through untouched.
preprocessing = ColumnTransformer(
    [
        ("num_preprocess", num_pipeline, num_columns),
        ("cat_preprocess", cat_pipeline, cat_columns),
    ],
    remainder="passthrough",
)

In [11]:
# Baseline model: shared preprocessing followed by a single decision tree
# with default hyper-parameters and a fixed seed.
pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("decisiontree", DecisionTreeClassifier(random_state=42)),
    ]
)

In [12]:
# Fit the preprocessing steps and the decision tree on the training split only.
pipeline.fit(xtrain,ytrain)

In [13]:
# Score the fitted decision-tree pipeline on the held-out test split.
pipeline.score(xtest,ytest)

0.63125

In [14]:
# Score on the training split for comparison; the recorded 1.0 here against
# 0.63 on the test split shows the tree fits the training data far better than unseen data.
pipeline.score(xtrain,ytrain)

1.0

# Bagging (Random Forest)


In [15]:
# Same preprocessing, with a 100-tree random forest as the final estimator.
# The step is still called "decisiontree": the grid-search keys defined further
# down use that prefix, so the name must stay in sync with them.
pipeline1 = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("decisiontree", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)


In [16]:
# Fit the preprocessing steps and the random forest on the training split only.
pipeline1.fit(xtrain,ytrain)

In [17]:
# Score the fitted random-forest pipeline on the held-out test split.
pipeline1.score(xtest,ytest)

0.73125

In [18]:
# Score on the training split for comparison; the recorded 1.0 here against
# 0.73 on the test split shows the forest still fits the training data perfectly.
pipeline1.score(xtrain,ytrain)

1.0

# GridSearchCV

In [19]:
# Rebuilds the same ColumnTransformer under the same name.
# NOTE(review): pipeline1 was constructed earlier and holds the previous object,
# and nothing visible below reads `preprocessing`, so this rebinding appears to
# have no effect on the grid search — confirm and consider removing the cell.
preprocessing = ColumnTransformer(
    [
        ("num_preprocess", num_pipeline, num_columns),
        ("cat_preprocess", cat_pipeline, cat_columns),
    ],
    remainder="passthrough",
)

In [20]:
# Hyper-parameter grid for the final step of pipeline1. The "decisiontree__" prefix
# is that step's name in the pipeline, even though the estimator is a random forest.
params = {
    "decisiontree__criterion": ["entropy", "gini"],
    "decisiontree__max_depth": [5, 10, 50, 100, 200],
    "decisiontree__min_samples_leaf": [2, 3, 5, 7, 10],
    "decisiontree__min_samples_split": [2, 3, 5, 10],
}

# Exhaustive 5-fold cross-validated search over the grid, using all available cores.
gridsearch = GridSearchCV(
    estimator=pipeline1,
    param_grid=params,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search over `params` using only the training split.
gridsearch.fit(xtrain,ytrain)

In [None]:
# Score the search's selected model on the held-out test split.
gridsearch.score(xtest,ytest)

0.70625

In [None]:
# Score on the training split for comparison; the recorded 0.74 here against
# 0.71 on the test split is a much smaller gap than the untuned models showed.
gridsearch.score(xtrain,ytrain)

0.7429467084639498

In [None]:
# Show the hyper-parameter combination the search selected.
gridsearch.best_params_

{'decisiontree__criterion': 'entropy',
 'decisiontree__max_depth': 10,
 'decisiontree__min_samples_leaf': 3,
 'decisiontree__min_samples_split': 10}