<a href="https://colab.research.google.com/github/Althaf0097/Road_Accident_Prediction/blob/main/Road_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [156]:
!pip install -U imbalanced-learn # Installs the imblearn library which contains SMOTE




# Road Accident Severity in India

## Load the dataset

In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier


## Data Cleaning


In [158]:
data = pd.read_csv('Road.csv')

In [159]:
data.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,...,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,01:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,01:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [160]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Time                         12316 non-null  object
 1   Day_of_week                  12316 non-null  object
 2   Age_band_of_driver           12316 non-null  object
 3   Sex_of_driver                12316 non-null  object
 4   Educational_level            11575 non-null  object
 5   Vehicle_driver_relation      11737 non-null  object
 6   Driving_experience           11487 non-null  object
 7   Type_of_vehicle              11366 non-null  object
 8   Owner_of_vehicle             11834 non-null  object
 9   Service_year_of_vehicle      8388 non-null   object
 10  Defect_of_vehicle            7889 non-null   object
 11  Area_accident_occured        12077 non-null  object
 12  Lanes_or_Medians             11931 non-null  object
 13  Road_allignment              12

In [161]:
data.describe()

Unnamed: 0,Number_of_vehicles_involved,Number_of_casualties
count,12316.0,12316.0
mean,2.040679,1.548149
std,0.68879,1.007179
min,1.0,1.0
25%,2.0,1.0
50%,2.0,1.0
75%,2.0,2.0
max,7.0,8.0


In [162]:
data.shape

(12316, 32)

In [163]:
# Step 1: Replace 'na' strings with NaN for proper handling
data.replace('na', np.nan, inplace=True)

In [164]:
data.isna().sum()

Unnamed: 0,0
Time,0
Day_of_week,0
Age_band_of_driver,0
Sex_of_driver,0
Educational_level,741
Vehicle_driver_relation,579
Driving_experience,829
Type_of_vehicle,950
Owner_of_vehicle,482
Service_year_of_vehicle,3928


In [165]:
driving_experience_levels = [
    '1-2yr',
    'Above 10yr',
    '5-10yr',
    '2-5yr',        # Keep NaN as a string to indicate unknown
    'No Licence',
    'Below 1yr',
]

# Randomly choose values for missing entries in Driving_experience
missing_mask_experience = data['Driving_experience'].isnull()
data.loc[missing_mask_experience, 'Driving_experience'] = np.random.choice(
    driving_experience_levels,
    size=missing_mask_experience.sum()
)

In [166]:
# Step 10: Randomly fill missing values in the Area_accident_occured column
area_accident_levels = [
    'Residential areas',
    'Office areas',
    'Recreational areas',
    'Industrial areas',
    'Other',
    'Church areas',
    'Market areas',
    'Unknown',
    'Rural village areas',
    'Outside rural areas',
    'Hospital areas',
    'School areas'
]

if data['Area_accident_occured'].isnull().any():
    missing_mask_area = data['Area_accident_occured'].isnull()
    data.loc[missing_mask_area, 'Area_accident_occured'] = np.random.choice(
        area_accident_levels,
        size=missing_mask_area.sum()
    )


In [167]:
# Step 12: Randomly fill missing values in the Road_allignment column
road_allignment_levels = [
    'Tangent road with flat terrain',
    'Tangent road with mild grade and flat terrain',
    'Escarpments',
    'Tangent road with rolling terrain',
    'Gentle horizontal curve',
    'Tangent road with mountainous terrain and',
    'Steep grade downward with mountainous terrain',
    'Sharp reverse curve',
    'Steep grade upward with mountainous terrain'
]

if data['Road_allignment'].isnull().any():
    missing_mask_alignment = data['Road_allignment'].isnull()
    data.loc[missing_mask_alignment, 'Road_allignment'] = np.random.choice(
        road_allignment_levels,
        size=missing_mask_alignment.sum()
    )

In [168]:
# Step 6: Randomly fill missing values in the Type_of_vehicle column
type_of_vehicle_levels = [
    'Automobile',
    'Public (> 45 seats)',
    'Lorry (41-100Q)',
    'Public (13-45 seats)',
    'Lorry (11-40Q)',
    'Long lorry',
    'Public (12 seats)',
    'Taxi',
    'Pick up up to 10Q',
    'Stationwagen',
    'Ridden horse',
    'Other',
    'Bajaj',
    'Turbo',
    'Motorcycle',
    'Special vehicle',
    'Bicycle'
]

if data['Type_of_vehicle'].isnull().any():
    missing_mask_vehicle = data['Type_of_vehicle'].isnull()
    data.loc[missing_mask_vehicle, 'Type_of_vehicle'] = np.random.choice(
        type_of_vehicle_levels,
        size=missing_mask_vehicle.sum()
    )

In [169]:
# Step 14: Randomly fill missing values in the Vehicle_movement column
vehicle_movement_levels = [
    'Going straight',
    'U-Turn',
    'Moving Backward',
    'Turnover',
    'Waiting to go',
    'Getting off',
    'Reversing',
    'Parked',
    'Stopping',
    'Overtaking',
    'Other',
    'Entering a junction'
]

missing_mask_movement = data['Vehicle_movement'].isnull()
data.loc[missing_mask_movement, 'Vehicle_movement'] = np.random.choice(
    vehicle_movement_levels,
    size=missing_mask_movement.sum()
)


In [170]:
selected_columns = [
    'Type_of_vehicle',
    'Sex_of_driver',
    'Age_band_of_driver',
    'Driving_experience',
    'Area_accident_occured',
    'Road_allignment',
    'Weather_conditions',
    'Light_conditions',
    'Cause_of_accident',
    'Vehicle_movement',
    'Accident_severity'  # This is the target variable
]

# Filter the dataset
clean_data = data[selected_columns]
# Display the first few rows of the filtered dataset to verify
pd.DataFrame(clean_data.head())

Unnamed: 0,Type_of_vehicle,Sex_of_driver,Age_band_of_driver,Driving_experience,Area_accident_occured,Road_allignment,Weather_conditions,Light_conditions,Cause_of_accident,Vehicle_movement,Accident_severity
0,Automobile,Male,18-30,1-2yr,Residential areas,Tangent road with flat terrain,Normal,Daylight,Moving Backward,Going straight,Slight Injury
1,Public (> 45 seats),Male,31-50,Above 10yr,Office areas,Tangent road with flat terrain,Normal,Daylight,Overtaking,Going straight,Slight Injury
2,Lorry (41?100Q),Male,18-30,1-2yr,Recreational areas,Escarpments,Normal,Daylight,Changing lane to the left,Going straight,Serious Injury
3,Public (> 45 seats),Male,18-30,5-10yr,Office areas,Tangent road with mild grade and flat terrain,Normal,Darkness - lights lit,Changing lane to the right,Going straight,Slight Injury
4,Pick up up to 10Q,Male,18-30,2-5yr,Industrial areas,Tangent road with flat terrain,Normal,Darkness - lights lit,Overtaking,Going straight,Slight Injury


In [171]:
clean_data.isna().sum()

Unnamed: 0,0
Type_of_vehicle,0
Sex_of_driver,0
Age_band_of_driver,0
Driving_experience,0
Area_accident_occured,0
Road_allignment,0
Weather_conditions,0
Light_conditions,0
Cause_of_accident,0
Vehicle_movement,0


In [172]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Type_of_vehicle        12316 non-null  object
 1   Sex_of_driver          12316 non-null  object
 2   Age_band_of_driver     12316 non-null  object
 3   Driving_experience     12316 non-null  object
 4   Area_accident_occured  12316 non-null  object
 5   Road_allignment        12316 non-null  object
 6   Weather_conditions     12316 non-null  object
 7   Light_conditions       12316 non-null  object
 8   Cause_of_accident      12316 non-null  object
 9   Vehicle_movement       12316 non-null  object
 10  Accident_severity      12316 non-null  object
dtypes: object(11)
memory usage: 1.0+ MB


In [173]:
# Optionally, save the filtered dataset to a new CSV file
clean_data.to_csv('Road_accidents.csv', index=False)