<a href="https://colab.research.google.com/github/DineshY1011/US_Accident/blob/main/Milestone_2/Week_4/Day_19/Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

# Load the sampled dataset
df = pd.read_csv("/content/drive/MyDrive/US_Dataset/US_Accidents_sampled_1M.csv")

In [3]:
# 1. Parse datetimes
df["Start_Time"] = pd.to_datetime(df["Start_Time"], errors="coerce")
df["End_Time"]   = pd.to_datetime(df["End_Time"], errors="coerce")

In [4]:
# 2. Drop duplicates and rows with invalid times
df = df.drop_duplicates(subset="ID")
df = df.dropna(subset=["Start_Time", "End_Time"])

In [5]:
# 3. Drop rows missing critical location data
df = df.dropna(subset=["Start_Lat", "Start_Lng"])

In [6]:
# 4. Compute incident duration in minutes
df["Duration_Minutes"] = (df["End_Time"] - df["Start_Time"]).dt.total_seconds() / 60

In [7]:
# 5. Extract temporal features
df["Hour"]        = df["Start_Time"].dt.hour
df["DayOfWeek"]   = df["Start_Time"].dt.weekday
df["Month"]       = df["Start_Time"].dt.month
df["IsWeekend"]   = df["DayOfWeek"].isin([5,6]).astype(int)

In [8]:
# 6. Encode boolean traffic feature flags as integers
bool_cols = [
    "Roundabout",
    "Station",
    "Stop",
    "Traffic_Calming",
    "Traffic_Signal",
    "Turning_Loop"
]
for col in bool_cols:
    df[col] = df[col].astype(int)

In [9]:
# 7. Encode light condition as binary day/night
df["IsDay"] = (df["Sunrise_Sunset"] == "Day").astype(int)

In [10]:
# 8. Drop columns not used for modeling
drop_cols = [
    "ID",
    "Source",
    "Description",
    "Street",
    "Start_Time",
    "End_Time",
    "Sunrise_Sunset",
    "Civil_Twilight",
    "Nautical_Twilight",
    "Astronomical_Twilight"
]
df = df.drop(columns=drop_cols)

In [11]:
# 9. Handle missing values in numeric columns
#    Fill numeric NaNs with median
num_cols = df.select_dtypes(include="number").columns.tolist()
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

In [12]:
# 10. Final clean-up: remove any remaining rows with NaNs
df = df.dropna()

In [13]:
df['Severity'].value_counts()

Unnamed: 0_level_0,count
Severity,Unnamed: 1_level_1
2,683833
3,163944
4,22893
1,8603


In [14]:
# 11. stratify/drop rare severity classes if needed
# e.g., keep only severity levels 1-4
df = df[df["Severity"].isin([1,2,3,4])]

# Save cleaned dataset
df.to_csv("accidents_cleaned.csv", index=False)

In [15]:
print("Cleaned dataset shape:", df.shape)
df.head()

Cleaned dataset shape: (879273, 42)


Unnamed: 0,Severity,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),City,County,State,Zipcode,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Duration_Minutes,Hour,DayOfWeek,Month,IsWeekend,IsDay
0,1,26.7069,-80.11936,26.7069,-80.11936,0.0,West Palm Beach,Palm Beach,FL,33417-4638,...,0,0,1,0,60.0,9,4,4,0,1
2,3,33.985249,-84.269348,36.232482,-89.665492,0.0,Alpharetta,Fulton,GA,30022,...,0,0,0,0,30.0,16,4,8,0,1
3,3,47.118706,-122.556908,36.232482,-89.665492,0.0,Tacoma,Pierce,WA,98433,...,0,0,0,0,33.733333,15,4,9,0,1
4,2,33.451355,-111.890343,36.232482,-89.665492,0.0,Scottsdale,Maricopa,AZ,85256,...,0,0,0,0,76.433333,16,0,6,0,1
5,2,42.44891,-93.721138,36.232482,-89.665492,7.77,Webster City,Hamilton,IA,50595,...,0,0,0,0,242.9,12,3,2,0,1
