In [8]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset from disk.
# The literal string 'na' is registered as an extra missing-value marker,
# so those cells are parsed as NaN.
df = pd.read_csv(
    'RTA Dataset.csv',
    na_values=['na'],
)

# Report the (rows, columns) dimensions, then preview the first ten records
print(f"Dataset Shape: {df.shape}")
df.head(n=10)

Dataset Shape: (12316, 32)


Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,,,,,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,,,,,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,...,Going straight,Driver or rider,Male,31-50,3.0,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,...,Going straight,Pedestrian,Female,18-30,3.0,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,...,Going straight,,,,,,,Not a Pedestrian,Overtaking,Slight Injury
5,14:15:00,Friday,31-50,Male,,Unknown,,,,,...,U-Turn,Driver or rider,Male,31-50,3.0,Driver,Normal,Not a Pedestrian,Overloading,Slight Injury
6,17:30:00,Wednesday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Owner,,...,Moving Backward,Driver or rider,Female,18-30,3.0,Driver,Normal,Not a Pedestrian,Other,Slight Injury
7,17:20:00,Friday,18-30,Male,Junior high school,Employee,2-5yr,Automobile,Governmental,Above 10yr,...,U-Turn,,,,,,Normal,Not a Pedestrian,No priority to vehicle,Slight Injury
8,17:20:00,Friday,18-30,Male,Junior high school,Employee,Above 10yr,Lorry (41?100Q),Owner,1-2yr,...,Going straight,Pedestrian,Male,Under 18,3.0,Driver,Normal,Crossing from driver's nearside,Changing lane to the right,Slight Injury
9,17:20:00,Friday,18-30,Male,Junior high school,Employee,1-2yr,Automobile,Owner,2-5yrs,...,U-Turn,Passenger,Male,18-30,3.0,Driver,Normal,Not a Pedestrian,Moving Backward,Serious Injury


In [16]:
# Columns with many missing values (>1000): these get an explicit
# 'Unknown' category instead of an imputed value.
cols_high_missing = [
    'Service_year_of_vehicle', 'Defect_of_vehicle', 'Casualty_class',
    'Sex_of_casualty', 'Age_band_of_casualty', 'Casualty_severity',
    'Work_of_casuality', 'Fitness_of_casuality'
]

# Columns with few missing values: these are imputed with their most
# frequent value (mode).
cols_low_missing = [
    'Educational_level', 'Vehicle_driver_relation', 'Driving_experience',
    'Type_of_vehicle', 'Owner_of_vehicle', 'Area_accident_occured',
    'Lanes_or_Medians', 'Road_allignment', 'Types_of_Junction',
    'Road_surface_type', 'Type_of_collision', 'Vehicle_movement'
]

# Fill ONLY the high-missing columns with 'Unknown'.
# (Looping over both lists here would leave no NaN in the low-missing
# columns, turning the mode imputation below into a no-op.)
for col in cols_high_missing:
    df[col] = df[col].fillna('Unknown')

# Fill the low-missing columns with their mode
for col in cols_low_missing:
    modes = df[col].mode()
    # mode() ignores NaN, so it is empty when a column has no observed
    # values at all; in that case there is nothing to impute from.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Verify cleaning
print(f"Missing value remaining: {df.isnull().sum().sum()}")

Missing value remaining: 0


In [17]:
# Parse the 'Time' strings (hours:minutes:seconds) into pandas timestamps.
# NOTE: the format carries no date component, so every parsed value lands on
# the parser's default date (1900-01-01); only the time of day is meaningful.
time_parsed = pd.to_datetime(df['Time'], format='%H:%M:%S')
df['Time'] = time_parsed

# Derive the hour of day (0-23) as its own numeric column
df['Hour'] = time_parsed.dt.hour

# With 'Hour' available, the 'Time' column could optionally be dropped:
# df = df.drop(columns=['Time'])

In [18]:
# Save the cleaned DataFrame to a CSV file (row index omitted).
# 'Time' was parsed earlier from time-only strings, so as a datetime column
# it carries a placeholder date (1900-01-01). date_format writes datetime
# columns as HH:MM:SS so that meaningless date does not end up in the file.
df.to_csv('cleaned_rta_data.csv', index=False, date_format='%H:%M:%S')