In [1]:
import os
import json
#import joblib
import numpy as np
import pandas as pd

from pathlib import Path
from datetime import datetime

# --- Paths (relative to the notebook's working directory) ---
DATA_PROCESSED = "../data/processed/tabular"
MODELS_DIR = "../models"
RESULTS_DIR = "../results/metrics"

# Create the output folders up front so later cells can write into them.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Files produced by 06_split_data.ipynb
TRAIN_PATH = f"{DATA_PROCESSED}/train.csv"
VAL_PATH   = f"{DATA_PROCESSED}/val.csv"
TEST_PATH  = f"{DATA_PROCESSED}/test.csv"

# Columns that should be read as datetimes rather than strings.
DATE_COLUMNS = ("dispatch_date", "delivery_date")


def load_split(path, date_columns=DATE_COLUMNS):
    """Read one split CSV, parsing its date columns as datetimes.

    ``parse_dates=True`` only asks pandas to parse the *index*; with no
    ``index_col`` it is a no-op and every date column stays a plain string.
    The columns are therefore passed by name.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    date_columns : iterable of str
        Column names to parse as dates. Names missing from the file are
        skipped, so a split without one of them still loads.

    Returns
    -------
    pandas.DataFrame
        The loaded split.
    """
    # Peek at the header only, so we never request a column that is absent
    # (read_csv raises if a parse_dates name is missing).
    header = pd.read_csv(path, nrows=0).columns
    present = [name for name in date_columns if name in header]
    return pd.read_csv(
        path,
        low_memory=False,
        parse_dates=present if present else False,
    )


df_train = load_split(TRAIN_PATH)
df_val   = load_split(VAL_PATH)
df_test  = load_split(TEST_PATH)

print(df_train.shape, df_val.shape, df_test.shape)
df_train.head(4)

(24545, 16) (5260, 16) (5260, 16)


Unnamed: 0,shipment_id,origin,destination,dispatch_date,delivery_date,delay_days,disruption_type,risk_score,source,lead_time_days,delay_severity,month,weekday,quarter,year,route_risk_score
0,O1000,B33,S23,2023-10-27 00:00:00,2023-10-28,0.0,,0.0,resilience,1.0,Minor,10,4,4,2023,1.0
1,O1001,B1,S20,2023-07-08 00:00:00,2023-07-09,0.0,,0.0,resilience,1.0,Minor,7,5,3,2023,1.0
2,O1002,B2,S10,2023-12-29 00:00:00,2024-01-07,7.0,Shortage,1.0,resilience,9.0,Severe,12,4,4,2023,1.0
3,O1003,B6,S10,2023-01-17 00:00:00,2023-01-20,0.0,,0.0,resilience,3.0,Moderate,1,1,1,2023,1.0


Utility: Feature/Target Auto-Detection

In [3]:
def find_col(candidates, cols):
    """Pick the highest-priority candidate that is present in ``cols``.

    ``candidates`` is scanned in order and the first entry found in ``cols``
    is returned. If none of them is present, the result is ``None``.
    """
    return next((name for name in candidates if name in cols), None)

# Lower-cased view of the training columns (for case-insensitive matching)
# and a lookup from each lower-cased name back to its original spelling.
cols = [name.lower() for name in df_train.columns]
col_map = {name.lower(): name for name in df_train.columns}

# --- Candidate names, in priority order (earlier entries win) ---

# Classification target (disruption flag)
# NOTE(review): "disruption_type" is the last-resort candidate and is the one
# matched for this dataset; the sample rows show it holding a category label
# (or blank) rather than a 0/1 flag — confirm downstream cells account for that.
classification_candidates = [
    "disruption_flag",
    "is_disrupted",
    "disrupted",
    "risk_flag",
    "has_disruption",
    "disruption",
    "incident_flag",
    "disruption_type",
]

# Regression target (delay days)
regression_candidates = [
    "delay_days",
    "delivery_delay_days",
    "delay",
    "days_delayed",
    "delay_in_days",
]

# ID & date columns
id_candidates = ["shipment_id", "id", "order_id", "consignment_id"]
date_candidates = [
    "dispatch_date",
    "ship_date",
    "event_time",
    "timestamp",
    "created_at",
    "pickup_date",
]

# --- Detection on the lower-cased names ---
clf_target_lc = find_col(classification_candidates, cols)
reg_target_lc = find_col(regression_candidates, cols)
id_col_lc = find_col(id_candidates, cols)
date_col_lc = find_col(date_candidates, cols)

# Resolve to original column names (case-preserving).
# dict.get returns None for an undetected (None) key, so no extra guard is needed.
clf_target, reg_target, id_col, date_col = (
    col_map.get(key)
    for key in (clf_target_lc, reg_target_lc, id_col_lc, date_col_lc)
)

print("Classification target:", clf_target)
print("Regression target:", reg_target)
print("ID column:", id_col)
print("Date column:", date_col)

# Guardrails: at least one target must have been found.
if all(target is None for target in (clf_target, reg_target)):
    raise ValueError("No target columns detected. Please set clf_target/reg_target manually.")

Classification target: disruption_type
Regression target: delay_days
ID column: shipment_id
Date column: dispatch_date
