# Road Accident Prediction

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [36]:
# Read the training and test tables from CSV files in the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [37]:
# Show the training frame as the cell's output (the rendered preview elides the middle rows).
train

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.30
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517749,517749,highway,4,0.10,70,daylight,foggy,True,True,afternoon,False,False,2,0.32
517750,517750,rural,4,0.47,35,daylight,rainy,True,True,morning,False,False,1,0.26
517751,517751,urban,4,0.62,25,daylight,foggy,False,False,afternoon,False,True,0,0.19
517752,517752,highway,3,0.63,25,night,clear,True,False,afternoon,True,True,3,0.51


In [38]:
# Columns fed to the model, in the order the model sees them.
feature_columns = ['road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting', 'weather',
                   'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season']

# Feature columns that are replaced by integer codes below (text or True/False
# values in the preview of `train`); the remaining features are used as-is.
categorical_columns = ['road_type', 'lighting', 'weather', 'road_signs_present',
                       'public_road', 'time_of_day', 'holiday', 'school_season']

# .copy() so the in-place encoding below never writes back into `train`.
x_train = train[feature_columns].copy()
y_train = train['accident_risk']

# Count missing values per feature column and in the target.
print(x_train.isna().sum())
print(y_train.isna().sum())
print("*************************************************************")

# Shapes of the feature matrix and the target vector.
print(x_train.shape)
print(y_train.shape)
print("*************************************************************")

# Dtype of every column. DataFrame.info() prints its own report and returns
# None, so it must not be wrapped in print() (that would emit a stray "None").
x_train.info()
print("*************************************************************")

# Replace every categorical column with integer codes, because LinearRegression
# needs numeric inputs. The single encoder is re-fit for each column in turn, so
# after the loop `le` only remembers the mapping of the last column.
# NOTE(review): integer codes impose an arbitrary order on unordered categories
# such as road_type or weather; one-hot encoding is usually a better fit for a
# linear model -- consider switching.
le = LabelEncoder()
for column in categorical_columns:
    x_train[column] = le.fit_transform(x_train[column])





road_type             0
num_lanes             0
curvature             0
speed_limit           0
lighting              0
weather               0
road_signs_present    0
public_road           0
time_of_day           0
holiday               0
school_season         0
dtype: int64
0
*************************************************************
(517754, 11)
(517754,)
*************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517754 entries, 0 to 517753
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   road_type           517754 non-null  object 
 1   num_lanes           517754 non-null  int64  
 2   curvature           517754 non-null  float64
 3   speed_limit         517754 non-null  int64  
 4   lighting            517754 non-null  object 
 5   weather             517754 non-null  object 
 6   road_signs_present  517754 non-null  bool   
 7   public_r

In [41]:
# Same feature columns as the training matrix; .copy() keeps `test` untouched.
x_test = test[['road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season']].copy()

# Encode the test features with the SAME category -> integer mapping that the
# training features received. The encoder is therefore fit on the matching
# column of `train` and only *applied* to the test column. Fitting on the test
# data instead would silently assign different codes whenever the test split
# has a different set of categories than the training split.
# transform() raises ValueError if the test data contains a category that never
# occurs in training, which is preferable to mis-encoding it silently.
for column in ['road_type', 'lighting', 'weather', 'road_signs_present',
               'public_road', 'time_of_day', 'holiday', 'school_season']:
    x_test[column] = le.fit(train[column]).transform(x_test[column])



In [54]:
# Fit an ordinary least-squares model on the encoded training features.
reg = LinearRegression()
reg.fit(x_train, y_train)

# Predictions for the very rows the model was fitted on.
y_train_pred = reg.predict(x_train)

# R^2 (coefficient of determination) is the share of target variance explained
# by the model; it is not an accuracy. It is computed here on the training data,
# so it is an in-sample figure and says nothing about performance on unseen rows.
train_r2 = r2_score(y_train, y_train_pred)
# Format with one decimal so float artefacts of the multiplication
# (e.g. 56.99999999999999) never reach the printed score.
print(f"SCORE : {np.round(train_r2, 2) * 100:.1f} %")

# Side-by-side view of the actual and the predicted risk for each training row.
pd.DataFrame({
    'ID':train['id'],
    'ORIGINAL ACCIDENT RISK': y_train,
    'PREDICTED ACCIDENT RISK': np.round(y_train_pred, 2)
})





SCORE : 71.0 %


Unnamed: 0,ID,ORIGINAL ACCIDENT RISK,PREDICTED ACCIDENT RISK
0,0,0.13,0.11
1,1,0.35,0.34
2,2,0.30,0.48
3,3,0.21,0.21
4,4,0.56,0.37
...,...,...,...
517749,517749,0.32,0.26
517750,517750,0.26,0.25
517751,517751,0.19,0.21
517752,517752,0.51,0.35
