In [1]:
# notebooks/02_data_cleaning.ipynb
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import joblib

In [3]:
# =======================
# 1Ô∏è‚É£ Load Dataset
# =======================
DATA_PATH = "C:/Users/Anupam/Desktop/traffic_prediction_project/data_raw/US_Accidents_March23.csv"

print("Loading dataset...")
df = pd.read_csv(DATA_PATH, nrows=200000)  # sample to avoid memory issues
print("Shape:", df.shape)

Loading dataset...


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Anupam/Desktop/traffic_prediction_project/data_raw/US_Accidents_March23.csv'

In [None]:
# =======================
# 2Ô∏è‚É£ Select Columns
# =======================
cols_to_keep = [
    'Severity', 'Start_Time', 'Start_Lat', 'Start_Lng', 'Distance(mi)',
    'City', 'State', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Speed(mph)', 'Weather_Condition',
    'Sunrise_Sunset', 'Traffic_Signal'
]
df = df[cols_to_keep]
print("Columns retained:", len(df.columns))

In [None]:
# =======================
# 3Ô∏è‚É£ Handle Missing Values
# =======================
df.dropna(thresh=int(len(df.columns) * 0.5), inplace=True)

for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

In [None]:
# =======================
# 4Ô∏è‚É£ Extract Time Features
# =======================
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
df['Hour'] = df['Start_Time'].dt.hour
df['Month'] = df['Start_Time'].dt.month
df['Year'] = df['Start_Time'].dt.year
df.drop(columns=['Start_Time'], inplace=True)

In [None]:
# =======================
# 5Ô∏è‚É£ Encode Categorical Columns (Separate Encoder for Each)
# =======================
cat_cols = ['City', 'State', 'Weather_Condition', 'Sunrise_Sunset', 'Traffic_Signal']
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# ‚úÖ Save all label encoders
os.makedirs("C:/Users/Anupam/Desktop/traffic_prediction_project/models", exist_ok=True)
encoders_path = "C:/Users/Anupam/Desktop/traffic_prediction_project/models/label_encoders.pkl"
joblib.dump(label_encoders, encoders_path)
print(f"‚úÖ Label encoders saved to: {encoders_path}")

In [None]:
# =======================
# 6Ô∏è‚É£ Split Train & Test
# =======================
X = df.drop('Severity', axis=1)
y = df['Severity']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# =======================
# 7Ô∏è‚É£ Scale Numerical Columns
# =======================
scaler = StandardScaler()
num_cols = X_train.select_dtypes(include=[np.number]).columns

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [None]:
# ‚úÖ Save scaler too (for later use if needed)
scaler_path = "C:/Users/Anupam/Desktop/traffic_prediction_project/models/scaler.pkl"
joblib.dump(scaler, scaler_path)
print(f"‚úÖ Scaler saved to: {scaler_path}")

In [None]:
# =======================
# 8Ô∏è‚É£ Save Processed Data
# =======================
os.makedirs("C:/Users/Anupam/Desktop/traffic_prediction_project/data_processed", exist_ok=True)
train_path = "C:/Users/Anupam/Desktop/traffic_prediction_project/data_processed/train_data.csv"
test_path = "C:/Users/Anupam/Desktop/traffic_prediction_project/data_processed/test_data.csv"

pd.concat([X_train, y_train], axis=1).to_csv(train_path, index=False)
pd.concat([X_test, y_test], axis=1).to_csv(test_path, index=False)

print(f"‚úÖ Processed training data saved to: {train_path}")
print(f"‚úÖ Processed testing data saved to: {test_path}")
print("üéâ Data cleaning & preprocessing complete successfully!")