In [1]:
import os
import json
#import joblib
import numpy as np
import pandas as pd

from pathlib import Path
from datetime import datetime

DATA_PROCESSED = "../data/processed/tabular"
MODELS_DIR = "../models"
RESULTS_DIR = "../results/metrics"

os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Files produced by 06_split_data.ipynb
TRAIN_PATH = f"{DATA_PROCESSED}/train.csv"
VAL_PATH   = f"{DATA_PROCESSED}/val.csv"
TEST_PATH  = f"{DATA_PROCESSED}/test.csv"

df_train = pd.read_csv(TRAIN_PATH, low_memory=False, parse_dates=True)
df_val   = pd.read_csv(VAL_PATH, low_memory=False, parse_dates=True)
df_test  = pd.read_csv(TEST_PATH, low_memory=False, parse_dates=True)

print(df_train.shape, df_val.shape, df_test.shape)
df_train.head(4)

(24545, 16) (5260, 16) (5260, 16)


Unnamed: 0,shipment_id,origin,destination,dispatch_date,delivery_date,delay_days,disruption_type,risk_score,source,lead_time_days,delay_severity,month,weekday,quarter,year,route_risk_score
0,O1000,B33,S23,2023-10-27 00:00:00,2023-10-28,0.0,,0.0,resilience,1.0,Minor,10,4,4,2023,1.0
1,O1001,B1,S20,2023-07-08 00:00:00,2023-07-09,0.0,,0.0,resilience,1.0,Minor,7,5,3,2023,1.0
2,O1002,B2,S10,2023-12-29 00:00:00,2024-01-07,7.0,Shortage,1.0,resilience,9.0,Severe,12,4,4,2023,1.0
3,O1003,B6,S10,2023-01-17 00:00:00,2023-01-20,0.0,,0.0,resilience,3.0,Moderate,1,1,1,2023,1.0


Utility: Feature/Target Auto-Detection

In [2]:
def find_col(candidates, cols):
    """Return the first column from candidates that exists in cols; else None."""
    for c in candidates:
        if c in cols: 
            return c
    return None

cols = [c.lower() for c in df_train.columns]
col_map = {c.lower(): c for c in df_train.columns}

# Classification target (disruption flag)
classification_candidates = [
    "disruption_flag","is_disrupted","disrupted","risk_flag",
    "has_disruption","disruption","incident_flag", "disruption_type"
]
clf_target_lc = find_col(classification_candidates, cols)

# Regression target (delay days)
regression_candidates = [
    "delay_days","delivery_delay_days","delay","days_delayed","delay_in_days"
]
reg_target_lc = find_col(regression_candidates, cols)

# ID & date columns
id_candidates = ["shipment_id","id","order_id","consignment_id"]
date_candidates = ["dispatch_date","ship_date","event_time","timestamp","created_at","pickup_date"]

id_col_lc = find_col(id_candidates, cols)
date_col_lc = find_col(date_candidates, cols)

# Resolve to original column names (case-preserving)
clf_target = col_map.get(clf_target_lc) if clf_target_lc else None
reg_target = col_map.get(reg_target_lc) if reg_target_lc else None
id_col     = col_map.get(id_col_lc) if id_col_lc else None
date_col   = col_map.get(date_col_lc) if date_col_lc else None

print("Classification target:", clf_target)
print("Regression target:", reg_target)
print("ID column:", id_col)
print("Date column:", date_col)

# Guardrails
if clf_target is None and reg_target is None:
    raise ValueError("No target columns detected. Please set clf_target/reg_target manually.")

Classification target: disruption_type
Regression target: delay_days
ID column: shipment_id
Date column: dispatch_date


Split Features

In [3]:
# Dropping obvious non-features
drop_cols = set([id_col, date_col, clf_target, reg_target]) - {None}
X_train_full = df_train.drop(columns = [c for c in drop_cols if c in df_train.columns])
X_val_full   = df_val.drop(columns = [c for c in drop_cols if c in df_val.columns])
X_test_full  = df_test.drop(columns = [c for c in drop_cols if c in df_test.columns])

#Targets
y_train_clf = df_train[clf_target] if clf_target in df_train.columns else None
y_val_clf   = df_val[clf_target]   if clf_target in df_val.columns else None
y_test_clf  = df_test[clf_target]  if clf_target in df_test.columns else None

y_train_reg = df_train[reg_target] if reg_target in df_train.columns else None
y_val_reg   = df_val[reg_target]   if reg_target in df_val.columns else None
y_test_reg  = df_test[reg_target]  if reg_target in df_test.columns else None

num_cols = [c for c in X_train_full.columns if pd.api.types.is_numeric_dtype(X_train_full[c])]
cat_cols = [c for c in X_train_full.columns if c not in num_cols]

len(num_cols), len(cat_cols), num_cols[:5], cat_cols[:5]

(7,
 5,
 ['risk_score', 'lead_time_days', 'month', 'weekday', 'quarter'],
 ['origin', 'destination', 'delivery_date', 'source', 'delay_severity'])

Common Preprocess Pipline (Impute + Scale + One-Hot)

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# ---------------------------
# Identify column groups
# ---------------------------

# Special categorical
special_cat_cols = ["disruption_type"] if "disruption_type" in df_train.columns else []

# Binary/flag columns
flag_cols = [c for c in df_train.columns 
             if c.lower() in ["risk_flag","disruption_flag","incident_flag"]]

# Special numeric
special_num_cols = [c for c in df_train.columns 
                    if c.lower() in ["route_risk_score", "lead_time_delays"]]

# General numeric (excluding special numeric)
num_cols = [c for c in df_train.select_dtypes(include=["int64","float64"]).columns
            if c not in special_num_cols and c not in flag_cols]

# General categorical (excluding special cat + flags)
cat_cols = [c for c in df_train.select_dtypes(include="object").columns
            if c not in special_cat_cols and c not in flag_cols]

# ---------------------------
# Pipelines
# ---------------------------

# General numeric
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False))
])

# Special numeric (route_risk_score, lead_time_delays) → fill with 0
special_num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", StandardScaler(with_mean=False))
])

# Special categorical: disruption_type
special_cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="none")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
])

# Flags → impute with 0 (no risk/disruption/incident)
flag_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0))
    # no encoding needed, already binary
])

# General categorical
general_cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
])

# ---------------------------
# Final ColumnTransformer
# ---------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("special_num", special_num_pipeline, special_num_cols),
        ("special_cat", special_cat_pipeline, special_cat_cols),
        ("flags", flag_pipeline, flag_cols),
        ("general_cat", general_cat_pipeline, cat_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3
)

print("✅ Preprocessor ready.")
print("Numeric:", num_cols)
print("Special Numeric:", special_num_cols)
print("Special Cat:", special_cat_cols)
print("Flags:", flag_cols)
print("General Cat:", cat_cols)


✅ Preprocessor ready.
Numeric: ['delay_days', 'risk_score', 'lead_time_days', 'month', 'weekday', 'quarter', 'year']
Special Numeric: ['route_risk_score']
Special Cat: ['disruption_type']
Flags: []
General Cat: ['shipment_id', 'origin', 'destination', 'dispatch_date', 'delivery_date', 'source', 'delay_severity']


Classification: Risk Prediction

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             classification_report, confusion_matrix)
from sklearn.linear_model import LogisticRegression

clf_results = {}

if clf_target is not None:
    # Ensure binary target as 0/1 if it is categorical/textual
    def to_binary(y):
        # treat ['none','no','false','0'] as 0; anything else as 1
        if y.dtype == object:
            y = y.fillna("none").str.lower().map(lambda v: 0 if v in ['none', 'no', 'false', '0', 'nan', 'unknown'] else 1)
            return y.astype(int)
        return y.astype(int)