<a href="https://colab.research.google.com/github/FishyDanny/Road-Safety-Risk-Prediction/blob/main/01_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import joblib
import os
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

In [None]:
# Download the BITRE fatalities workbook.
# header=4 tells pandas to take the column names from row index 4 of the sheet.
url = "https://aueprod01ckanstg.blob.core.windows.net/public-catalogue/1633969d-46d3-437f-82e5-4d468db04a9f/bitre_fatalities_may2025.xlsx"
data = pd.read_excel(url, header=4)

# Mount Google Drive and make sure the project and output folders exist
drive.mount('/content/drive')
os.makedirs('/content/drive/MyDrive/ACTL3143_project/', exist_ok=True)
os.makedirs('/content/drive/MyDrive/ACTL3143_project/processed', exist_ok=True)

# Save the raw dataset as an actual CSV file.
# DataFrame.to_csv() called without a path only returns the CSV text; pickling
# that text with joblib under a ".csv" name produces a file that pd.read_csv
# (or a spreadsheet) cannot open. Passing the path writes plain CSV instead.
# NOTE(review): any consumer that read this file with joblib.load must switch
# to pd.read_csv.
data.to_csv('/content/drive/MyDrive/ACTL3143_project/data.csv', index=False, header=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['/content/drive/MyDrive/ACTL3143_project/data.csv']

In [None]:
# Translate the textual unknown indicators to NaN (object/text columns only)
unknown_indicators = ["Unknown", "unknown", "-9"]
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].replace(unknown_indicators, np.nan)

# The string "-9" above never matches a numeric -9, so a sentinel stored as a
# number in the numeric features would otherwise be scaled as a real value.
# NOTE(review): assumes -9 is the "unknown" code for these columns too (it is
# listed as an unknown indicator above) - confirm against the data dictionary.
for col in ['Speed Limit', 'Age']:
    data[col] = data[col].mask(data[col].eq(-9))

# Assuming that unknown data is a 'Missing' response: keep these rows and give
# the encoders an explicit 'Missing' category instead of NaN.
missing_as_category = [
    'Bus Involvement',
    'Articulated Truck Involvement',
    'Heavy Rigid Truck Involvement',
    'National Road Type',
    'Road User',
    'Gender',
]
for col in missing_as_category:
    data[col] = data[col].fillna('Missing')

# Reduce the crash time to the hour of day; unparseable times become NaN.
# The guard keeps the cell safe to re-run: once 'Time' already holds numeric
# hours, pd.to_datetime would read them as epoch nanoseconds and turn every
# hour into 0.
if not pd.api.types.is_numeric_dtype(data['Time']):
    data['Time'] = pd.to_datetime(data['Time'], errors='coerce').dt.hour

# Drop rows lacking the target or any numeric feature. 'Age' is included so
# that ages blanked above do not reach the scaler as NaN.
data = data.dropna(subset=['Crash Type', 'Speed Limit', 'Time', 'Age'])

  data['Time'] = pd.to_datetime(data['Time'], errors='coerce').dt.hour


In [None]:
# Two-stage stratified split giving 60% train, 20% validation, 20% test.
split_seed = 42

# Stage 1: hold out 40% of the rows, keeping the Crash Type proportions.
train, temp_data = train_test_split(
    data, test_size=0.4, random_state=split_seed, stratify=data['Crash Type']
)

# Stage 2: halve the held-out 40% into validation and test (20% each),
# again stratified on Crash Type.
val, test = train_test_split(
    temp_data, test_size=0.5, random_state=split_seed, stratify=temp_data['Crash Type']
)

In [None]:
# Separate features and target
features = [
    'State', 'Speed Limit', 'National Road Type',                                         # Road features
    'Road User', 'Age', 'Gender',                                                         # Victim features
    'Bus Involvement', 'Articulated Truck Involvement', 'Heavy Rigid Truck Involvement',  # Vehicle involvement
    'Dayweek', 'Time','Christmas Period', 'Easter Period'                                 # Temporal features
]
target = 'Crash Type'

# Feature types: each group is routed to a different transformer below
numeric_features = ['Time', 'Speed Limit', 'Age']
categorical_features = ['State', 'Road User', 'Gender', 'Bus Involvement', 'Heavy Rigid Truck Involvement', 'Articulated Truck Involvement', 'Christmas Period', 'Easter Period']
ordinal_features = ['Dayweek', 'National Road Type']

preprocessor = ColumnTransformer([
    ('numerical', StandardScaler(), numeric_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    # Without handle_unknown, transform() raises as soon as val/test contains a
    # category that never appeared in train. Unseen categories are encoded as
    # -1 instead, mirroring the one-hot encoder's tolerance of unknowns; codes
    # for categories seen during fit are unchanged.
    # NOTE(review): with the default categories the integer codes follow the
    # sorted category values, not a weekday order - confirm this is intended.
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_features)
])

# Fit on the training split only, then apply the fitted transform to val/test
X_train = preprocessor.fit_transform(train[features])
X_val = preprocessor.transform(val[features])
X_test = preprocessor.transform(test[features])

# Convert the targets to binary variables for classification
crash_type_codes = {'Single': 0, 'Multiple': 1}


def encode_target(frame):
    """Return the target column of `frame` as a 0/1 integer array.

    Raises:
        ValueError: if the column holds a label missing from
            `crash_type_codes`; a plain .map() would silently turn such
            labels into NaN and the array into floats.
    """
    encoded = frame[target].map(crash_type_codes)
    if encoded.isna().any():
        unexpected = sorted(set(frame[target][encoded.isna()].astype(str)))
        raise ValueError(f"Unexpected {target} labels: {unexpected}")
    return encoded.astype(int).values


y_train = encode_target(train)
y_val = encode_target(val)
y_test = encode_target(test)

In [None]:
# Persist the transformed feature matrices and encoded targets with joblib
split_artifacts = [
    (X_train, '/content/drive/MyDrive/ACTL3143_project/processed/X_train.pkl'),
    (X_val, '/content/drive/MyDrive/ACTL3143_project/processed/X_val.pkl'),
    (X_test, '/content/drive/MyDrive/ACTL3143_project/processed/X_test.pkl'),
    (y_train, '/content/drive/MyDrive/ACTL3143_project/processed/y_train.pkl'),
    (y_val, '/content/drive/MyDrive/ACTL3143_project/processed/y_val.pkl'),
    (y_test, '/content/drive/MyDrive/ACTL3143_project/processed/y_test.pkl'),
]
for artifact, destination in split_artifacts:
    joblib.dump(artifact, destination)

# Also store the fitted preprocessor next to the arrays; left as the final
# expression so the cell still displays the written path.
joblib.dump(preprocessor, '/content/drive/MyDrive/ACTL3143_project/processed/preprocessor.pkl')

['/content/drive/MyDrive/ACTL3143_project/processed/preprocessor.pkl']