In [1]:
# === Block 1: Setup, ingest, target, quick summary ===
import warnings, os, json, math
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

DATA_PATH = "/kaggle/input/bim-ai-integrated-dataset/bim_ai_civil_engineering_dataset.csv"

# Ingest the raw project table.
df = pd.read_csv(DATA_PATH)

# Date columns are optional: whichever exist are coerced to datetime, with
# unparseable entries turned into NaT instead of raising.
for col in ("Start_Date", "End_Date"):
    if col not in df.columns:
        continue
    df[col] = pd.to_datetime(df[col], errors="coerce")

# Both duration columns are mandatory; rows whose planned duration is not
# strictly positive cannot produce a meaningful ratio and are dropped.
missing_required = {"Planned_Duration", "Actual_Duration"} - set(df.columns)
if missing_required:
    raise ValueError("Missing Planned_Duration or Actual_Duration in dataset.")
df = df.loc[df["Planned_Duration"] > 0].copy()

# Target: a project counts as delayed when actual duration exceeds plan by
# more than 15%.  NOTE(review): a missing Actual_Duration gives a NaN ratio,
# which compares False and is therefore labelled 0.
df["Delay_Ratio"] = df["Actual_Duration"].div(df["Planned_Duration"]).sub(1.0)
df["Is_Delayed"] = df["Delay_Ratio"].gt(0.15).astype(int)

# Quick summary (taken before the optional Project_ID column is added below).
n_rows, n_cols = df.shape
pos_rate = df["Is_Delayed"].mean()
print("=== Data Summary ===")
print(f"Rows: {n_rows}, Cols: {n_cols}, Positive rate (Is_Delayed): {pos_rate:.3f}")

# Guarantee an id column for later joins/exports: synthesize PJT_1..PJT_n
# when the dataset does not ship one.
if "Project_ID" not in df.columns:
    df["Project_ID"] = [f"PJT_{k}" for k in range(1, len(df) + 1)]

# Persist the working copy that the later blocks reload from disk.
df.to_csv("df_working.csv", index=False)
print("Saved: df_working.csv")

=== Data Summary ===
Rows: 1000, Cols: 30, Positive rate (Is_Delayed): 0.572
Saved: df_working.csv


In [2]:
# === Block 2: Planning Risk Index v2 (calibrated) ===
import numpy as np, pandas as pd, json, joblib
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

PATH = "df_working.csv"
# Block 1 treats the date columns as optional ("parse dates if present") and
# the code below guards on `"Start_Date" in df.columns`, so parse them after
# loading: read_csv(parse_dates=[...]) raises ValueError when a listed column
# is absent, which would make those guards unreachable.
df = pd.read_csv(PATH)
for _date_col in ("Start_Date", "End_Date"):
    if _date_col in df.columns:
        df[_date_col] = pd.to_datetime(df[_date_col], errors="coerce")

# Helpers
def nzstd(x):
    """Return the z-score of a Series, safe for constant columns.

    The values are cast to float, centred on their mean and divided by the
    standard deviation plus 1e-9, so a zero-variance column yields zeros
    instead of a division by zero.
    """
    values = x.astype(float)
    centered = values - values.mean()
    spread = values.std() + 1e-9
    return centered / spread

# Assemble the four transparent planning-time (ex-ante) components.  Each one
# is a sum of z-scored inputs; a column that is absent contributes nothing.
# Key order matters: it fixes the column order of COMP_DF and therefore the
# feature order seen by the model.
comp = {}

resource_score = 0
for c in ["Labor_Hours", "Equipment_Utilization", "Material_Usage"]:
    if c in df:
        resource_score = resource_score + nzstd(df[c])
comp["resource"] = resource_score

site_env_score = 0
for c in ["Temperature", "Humidity", "Air_Quality_Index"]:
    if c in df:
        site_env_score = site_env_score + nzstd(df[c])
comp["site_env"] = site_env_score

schedule_score = 0
if "Planned_Duration" in df:
    schedule_score = schedule_score + nzstd(df["Planned_Duration"])
if "Start_Date" in df.columns:
    s = pd.to_datetime(df["Start_Date"], errors="coerce")
    # Coarse seasonality: start month centred on 6.5 and scaled by 3.8;
    # rows without a usable start date get 0 for this term.
    month_term = (s.dt.month - 6.5) / 3.8
    schedule_score = schedule_score + month_term.fillna(0)
comp["schedule"] = schedule_score

cost_score = 0
if {"Planned_Cost", "Planned_Duration"}.issubset(df.columns):
    # Planned cost per unit of planned duration, z-scored.
    cost_rate = df["Planned_Cost"] / df["Planned_Duration"]
    cost_score = cost_score + nzstd(cost_rate)
comp["cost"] = cost_score

# Missing inputs become 0 after standardisation.
COMP_DF = pd.DataFrame(comp).fillna(0.0)
y = df["Is_Delayed"].astype(int).values

# 80/20 chronological hold-out: order rows by Start_Date when it is a real
# datetime column, otherwise fall back to file order.  rank(method="first")
# breaks date ties by row position; rows without a start date get a NaN rank,
# which sort_values places last (i.e. in the test portion).
# NOTE(review): the components were standardised over the full table before
# this split, so test rows influence the scaling statistics.
has_start_dates = "Start_Date" in df.columns and pd.api.types.is_datetime64_any_dtype(df["Start_Date"])
order = df["Start_Date"].rank(method="first") if has_start_dates else pd.Series(np.arange(len(df)))

idx_sorted = order.sort_values().index
split = int(0.8 * len(idx_sorted))
tr_idx = idx_sorted[:split]
te_idx = idx_sorted[split:]

X_train = COMP_DF.loc[tr_idx].values
X_test = COMP_DF.loc[te_idx].values
y_train = y[tr_idx]
y_test = y[te_idx]

# Class-balanced logistic regression wrapped in 3-fold sigmoid calibration,
# so the output can be read as a probability.
base = LogisticRegression(
    class_weight="balanced",
    max_iter=2000,
    random_state=42,
)
clf = CalibratedClassifierCV(base, cv=3, method="sigmoid")
clf.fit(X_train, y_train)

# Hold-out evaluation on the chronologically latest 20%.
p_test = clf.predict_proba(X_test)[:, 1]
brier = brier_score_loss(y_test, p_test)
pr = average_precision_score(y_test, p_test)
roc = roc_auc_score(y_test, p_test)
print(f"Planning Risk (component-calibrated) — ROC-AUC={roc:.3f} | PR-AUC={pr:.3f} | Brier={brier:.3f}")

# Score every project in the portfolio (training and hold-out rows alike).
p_all = clf.predict_proba(COMP_DF.values)[:,1]

out = pd.DataFrame({
    "Project_ID": df["Project_ID"],
    "comp_resource": COMP_DF["resource"],
    "comp_site_env": COMP_DF["site_env"],
    "comp_schedule": COMP_DF["schedule"],
    "comp_cost":     COMP_DF["cost"],
    "p_delay_planning": p_all,
})
# Health is the complement of the delay probability on a 0..100 scale.
out["health_score_planning"] = np.round(100*(1 - out["p_delay_planning"])).astype(int)

# Risk bands: tertile-like quantiles (33% / 66%) of the calibrated probability.
labels = ["Low","Medium","High"]
band_quantiles = [0, 0.33, 0.66, 1.0]
try:
    out["risk_band_planning"] = pd.qcut(out["p_delay_planning"], q=band_quantiles,
                                        labels=labels, duplicates="drop")
except ValueError:
    # Heavily tied probabilities collapse quantile edges, leaving fewer bins
    # than labels.  Re-deriving bins from the same quantiles (the previous
    # fallback) fails for the same reason, so band on tie-broken ranks
    # instead: ranks are unique, hence the edges are too.  Tied projects are
    # separated by row order in this path only; without ties the result is
    # identical to the value-based quantile cut above.
    tie_broken_rank = out["p_delay_planning"].rank(method="first")
    out["risk_band_planning"] = pd.qcut(tie_broken_rank, q=band_quantiles, labels=labels)

out.to_csv("brain_planning_v2_outputs.csv", index=False)
print("Saved: brain_planning_v2_outputs.csv")

# Human-readable global weights.  These come from an auxiliary, *uncalibrated*
# logistic regression fitted on the same training rows; they explain direction
# and relative size of each component, not the calibrated probabilities.
aux_lr = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42)
aux_lr.fit(X_train, y_train)
weights = dict(zip(["resource","site_env","schedule","cost"], aux_lr.coef_[0].tolist()))
intercept = float(aux_lr.intercept_[0])
drivers = {"component_weights": weights, "intercept": intercept}
# Context manager so the file is flushed and closed deterministically before
# later blocks read it back.
with open("brain_planning_component_weights.json", "w") as fh:
    json.dump(drivers, fh, indent=2)
print("Saved: brain_planning_component_weights.json")

# Save the calibrated model bundle for the what-if engine (joblib is already
# imported at the top of this cell).
joblib.dump({"calibrated_model": clf}, "brain_planning_component_model.pkl")
print("Saved: brain_planning_component_model.pkl")


Planning Risk (component-calibrated) — ROC-AUC=0.475 | PR-AUC=0.568 | Brier=0.243
Saved: brain_planning_v2_outputs.csv
Saved: brain_planning_component_weights.json
Saved: brain_planning_component_model.pkl


In [3]:
# === Block 3: Deterministic Planning Risk Index overlay ===
import numpy as np, pandas as pd

# Date columns are optional (the code below guards on `"Start_Date" in
# df.columns`), so parse them after loading: read_csv(parse_dates=[...])
# raises ValueError when a listed column is absent from the file.
df = pd.read_csv("df_working.csv")
for _date_col in ("Start_Date", "End_Date"):
    if _date_col in df.columns:
        df[_date_col] = pd.to_datetime(df[_date_col], errors="coerce")

def z(x):
    """Standardise a Series to zero mean and unit spread.

    1e-9 is added to the standard deviation so a constant column maps to
    zeros rather than dividing by zero.
    """
    x = x.astype(float)
    mu, sigma = x.mean(), x.std()
    return x.sub(mu).div(sigma + 1e-9)

# Components (deterministic weights): a fixed-weight overlay that needs no
# fitted model.  The weights sum to 1.
w = {"resource":0.35, "site_env":0.25, "schedule":0.25, "cost":0.15}

comp = {}

# Resource: sum of z-scored labour, equipment and material inputs.
t = 0
for c in ["Labor_Hours","Equipment_Utilization","Material_Usage"]:
    if c in df: t += z(df[c])
comp["resource"] = t

# Site / environment conditions.
t = 0
for c in ["Temperature","Humidity","Air_Quality_Index"]:
    if c in df: t += z(df[c])
comp["site_env"] = t

# Schedule: planned duration plus a coarse start-month seasonality term.
t = 0
if "Planned_Duration" in df:          t += z(df["Planned_Duration"])
if "Start_Date" in df.columns:
    s = pd.to_datetime(df["Start_Date"], errors="coerce")
    t += ((s.dt.month-6.5)/3.8).fillna(0)
comp["schedule"] = t

# Cost: planned cost per unit of planned duration.
t = 0
if {"Planned_Cost","Planned_Duration"}.issubset(df.columns):
    t += z(df["Planned_Cost"]/df["Planned_Duration"])
comp["cost"] = t

# Treat missing inputs as neutral (0), exactly as the model-based blocks do
# with `pd.DataFrame(comp).fillna(0.0)`.  Without this a single NaN input
# propagates into the index and the integer cast of the health score fails.
comp_df = pd.DataFrame(comp).fillna(0.0)

risk_idx = (w["resource"]*comp_df["resource"] + w["site_env"]*comp_df["site_env"]
            + w["schedule"]*comp_df["schedule"] + w["cost"]*comp_df["cost"])
# Min-max scale to [0, 1); the epsilon guards a portfolio with identical scores.
risk_idx = (risk_idx - risk_idx.min())/(risk_idx.max()-risk_idx.min()+1e-9)

out = pd.DataFrame({
    "Project_ID": df["Project_ID"],
    "planning_risk_index": risk_idx
})
out["health_score_planning"] = (100*(1 - out["planning_risk_index"])).round().astype(int)
# Fixed thresholds (not quantiles) on the scaled index.
out["risk_band"] = pd.cut(out["planning_risk_index"], bins=[0,0.33,0.66,1.0], labels=["Low","Medium","High"], include_lowest=True)

out.to_csv("planning_risk_index.csv", index=False)
print("Saved: planning_risk_index.csv")


Saved: planning_risk_index.csv


In [4]:
# === Block 4: What-If engine (planning) + examples ===
import numpy as np, pandas as pd, json, joblib

PATH = "df_working.csv"
# Date columns are optional (compute_components guards on `"Start_Date" in
# df.columns`), so parse them after loading: read_csv(parse_dates=[...])
# raises ValueError when a listed column is absent from the file.
df0 = pd.read_csv(PATH)
for _date_col in ("Start_Date", "End_Date"):
    if _date_col in df0.columns:
        df0[_date_col] = pd.to_datetime(df0[_date_col], errors="coerce")

# joblib.load unpickles arbitrary objects: only load the bundle written by
# Block 2 of this notebook, never a file from an untrusted source.
bundle = joblib.load("brain_planning_component_model.pkl")   # dict with {"calibrated_model": ...}
model  = bundle["calibrated_model"]

def nzstd_series(s):
    """Z-score a Series after casting it to float.

    The denominator is the standard deviation plus 1e-9, so a constant
    column produces zeros instead of a division by zero.
    """
    as_float = s.astype(float)
    denom = as_float.std() + 1e-9
    return (as_float - as_float.mean()) / denom

def compute_components(df):
    """Build the four planning components for every row of ``df``.

    Returns a DataFrame with columns ``resource``, ``site_env``, ``schedule``
    and ``cost`` (in that order).  Each component is a sum of z-scored input
    columns; inputs that are absent contribute nothing, and missing values
    end up as 0.0.  Standardisation statistics are taken from ``df`` itself.
    """
    def _zsum(columns):
        # Sum the z-scores of whichever of the named columns exist.
        total = 0
        for name in columns:
            if name in df:
                total = total + nzstd_series(df[name])
        return total

    resource = _zsum(["Labor_Hours", "Equipment_Utilization", "Material_Usage"])
    site_env = _zsum(["Temperature", "Humidity", "Air_Quality_Index"])

    # Schedule: planned duration plus a coarse start-month seasonality term
    # (0 where the start date is missing or unparseable).
    schedule = _zsum(["Planned_Duration"])
    if "Start_Date" in df.columns:
        start = pd.to_datetime(df["Start_Date"], errors="coerce")
        schedule = schedule + ((start.dt.month - 6.5) / 3.8).fillna(0)

    # Cost: planned cost per unit of planned duration.
    cost = 0
    if {"Planned_Cost", "Planned_Duration"}.issubset(df.columns):
        cost = cost + nzstd_series(df["Planned_Cost"] / df["Planned_Duration"])

    components = {
        "resource": resource,
        "site_env": site_env,
        "schedule": schedule,
        "cost": cost,
    }
    return pd.DataFrame(components).fillna(0.0)

def predict_prob(COMP):
    """Return the model's probability for the second class (column 1) per row of ``COMP``."""
    class_probabilities = model.predict_proba(COMP.values)
    return class_probabilities[:, 1]

# Baseline for the what-if comparisons: components and delay probabilities of
# the unmodified portfolio, computed once and reused by what_if() as p_base.
COMP0   = compute_components(df0)
BASE_P  = predict_prob(COMP0)

# What-if function
def what_if(row_idx, buffer_days=0, delta_utilization=0.0, delta_material_usage=0.0):
    """Re-score one project after hypothetical planning changes.

    Parameters
    ----------
    row_idx : int
        Position (0-based) of the project in ``df0`` / ``BASE_P``.
    buffer_days : number
        Added to ``Planned_Duration``; the result is floored at 1.
    delta_utilization : float
        Added to ``Equipment_Utilization`` (not clamped).
    delta_material_usage : float
        Added to ``Material_Usage``; the result is floored at 0.0.

    Returns
    -------
    dict
        ``{"p_base", "p_new", "delta"}`` where ``delta = p_new - p_base``.

    Notes
    -----
    ``df0`` is never mutated; the change is applied to a copy.  Components are
    recomputed on that copy, so the standardisation statistics include the
    modified row.
    """
    df = df0.copy()
    # Treat row_idx as a position everywhere: the probability arrays below are
    # indexed positionally, so resolve the matching index label once instead
    # of assuming the frame carries a default 0..n-1 index.
    row_label = df.index[row_idx]

    if "Planned_Duration" in df:
        df.loc[row_label, "Planned_Duration"] = max(1, df.loc[row_label, "Planned_Duration"] + buffer_days)
    if "Equipment_Utilization" in df:
        df.loc[row_label, "Equipment_Utilization"] = df.loc[row_label, "Equipment_Utilization"] + delta_utilization
    if "Material_Usage" in df:
        df.loc[row_label, "Material_Usage"] = max(0.0, df.loc[row_label, "Material_Usage"] + delta_material_usage)

    COMP = compute_components(df)
    p_new  = float(predict_prob(COMP)[row_idx])
    p_base = float(BASE_P[row_idx])
    return {"p_base": p_base, "p_new": p_new, "delta": p_new - p_base}

# Slider grids for FE: the discrete values each what-if control may take.
grid = {
    "buffer_days": [0, 5, 10, 15],
    "delta_utilization": [-0.10, 0.0, 0.10],
    "delta_material_usage": [-50.0, 0.0, 50.0]
}
# Context manager so the file is flushed and closed deterministically instead
# of relying on garbage collection of an anonymous handle.
with open("what_if_grid.json", "w") as fh:
    json.dump(grid, fh, indent=2)
print("Saved: what_if_grid.json")

# Examples for top-5 highest-risk projects: apply one fixed scenario
# (+10 buffer days, -0.05 utilization, -25 material usage) to each.
top_idx = np.argsort(-BASE_P)[:5]
rows = []
for i in top_idx:
    # argsort yields array positions, so look the id up positionally rather
    # than by index label.
    pid = df0["Project_ID"].iloc[i]
    res = what_if(int(i), buffer_days=10, delta_utilization=-0.05, delta_material_usage=-25.0)
    rows.append({"Project_ID": pid, **res})
pd.DataFrame(rows).to_csv("what_if_examples.csv", index=False)
print("Saved: what_if_examples.csv")


Saved: what_if_grid.json
Saved: what_if_examples.csv


In [5]:
# === Block 5: Top-N portfolio JSON, per-project detail JSON, slide table ===
import json, pandas as pd, numpy as np

# Per-project scores from Block 2 and the auxiliary component weights.
scores = pd.read_csv("brain_planning_v2_outputs.csv")
# Context manager so the JSON file handle is closed as soon as it is parsed.
with open("brain_planning_component_weights.json") as fh:
    weights = json.load(fh)
w = weights["component_weights"]

def row_reasons(row, k=3):
    """Return the ``k`` components with the largest absolute contribution.

    A contribution is the row's component value times the matching global
    weight in ``w`` (0.0 when the weight is absent).  The result is a list of
    ``{"factor", "contribution"}`` dicts ordered by descending magnitude;
    ties keep the order resource, site_env, schedule, cost.
    """
    factors = ("resource", "site_env", "schedule", "cost")
    scored = [(name, row[f"comp_{name}"] * w.get(name, 0.0)) for name in factors]
    scored.sort(key=lambda item: abs(item[1]), reverse=True)
    return [
        {"factor": name, "contribution": float(value)}
        for name, value in scored[:k]
    ]

# Top-N portfolio: the N projects with the highest delay probability.
N = 20
topN = scores.sort_values("p_delay_planning", ascending=False).head(N).copy()
payload = []
for _, r in topN.iterrows():
    payload.append({
        # str() keeps the id identical to the string keys used in
        # project_detail.json, so a portfolio item can be looked up there
        # whatever dtype Project_ID has in the scores table.
        "project_id": str(r["Project_ID"]),
        "p_delay": float(r["p_delay_planning"]),
        "health": int(r["health_score_planning"]),
        "band":   str(r["risk_band_planning"]),
        "reasons": row_reasons(r, k=3)
    })
with open("topN_portfolio.json", "w") as f:
    json.dump({"updated_at": pd.Timestamp.now().isoformat(), "items": payload}, f, indent=2)
print("Saved: topN_portfolio.json")

# Per-project detail, keyed by the stringified Project_ID (every scored row).
component_names = ("resource", "site_env", "schedule", "cost")
detail = {}
for _, r in scores.iterrows():
    pid = str(r["Project_ID"])
    record = {
        "p_delay": float(r["p_delay_planning"]),
        "health":  int(r["health_score_planning"]),
        "band":    str(r["risk_band_planning"]),
        "components": {name: float(r[f"comp_{name}"]) for name in component_names},
        "reasons": row_reasons(r, k=3),
    }
    detail[pid] = record

with open("project_detail.json", "w") as f:
    json.dump(detail, f, indent=2)
print("Saved: project_detail.json")

# Slide-ready Top-10 table: first ten rows of the Top-N frame with
# presentation-friendly column names.
slide_columns = {
    "Project_ID": "Project_ID",
    "p_delay_planning": "P_Delay",
    "health_score_planning": "Health",
    "risk_band_planning": "Band",
}
top10 = topN.head(10)[list(slide_columns)].rename(columns=slide_columns)
top10.to_csv("slide_top10_table.csv", index=False)
print("Saved: slide_top10_table.csv")


Saved: topN_portfolio.json
Saved: project_detail.json
Saved: slide_top10_table.csv


In [6]:
# === Block 6: Package artifacts into a zip for FE/demo handoff ===
import textwrap, zipfile, os
from pathlib import Path

# Artifacts produced by the earlier blocks, in the order they are archived.
files = [
    "brain_planning_v2_outputs.csv",
    "brain_planning_component_weights.json",
    "brain_planning_component_model.pkl",
    "what_if_grid.json",
    "what_if_examples.csv",
    "planning_risk_index.csv",        # from Block 3 (optional overlay)
    "topN_portfolio.json",
    "project_detail.json",
    "slide_top10_table.csv",
]

# Short hand-off note describing how the payloads map to mocked endpoints.
readme = textwrap.dedent("""
    PULSE Brain — Planning Risk (Demo Payloads)

    Use these to mock API responses in the FE:
    - GET /drivers       -> brain_planning_component_weights.json
    - GET /portfolio     -> topN_portfolio.json
    - GET /project/:id   -> project_detail.json[":id"]
    - GET /what-if-grid  -> what_if_grid.json

    Core fields:
    - p_delay_planning (0..1), Health = round(100 * (1 - p_delay_planning))
    - Bands: quantiles over p_delay_planning -> Low/Medium/High
    - Reasons: component-based contributions (resource/site_env/schedule/cost)
""").strip()
Path("README_BRAIN.txt").write_text(readme, encoding="utf-8")
files.append("README_BRAIN.txt")

zip_path = "PULSE_Brain_Payloads.zip"
# Packaging stays best-effort (absent artifacts are skipped), but the skip is
# reported so an incomplete hand-off is visible.  Descriptive names are used
# for the archive handle and loop variable so this cell does not rebind `z`
# (the z-score helper from Block 3) or `f` in the notebook namespace.
included_files = [name for name in files if os.path.exists(name)]
missing_files = [name for name in files if name not in included_files]
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
    for name in included_files:
        archive.write(name)
if missing_files:
    print("WARNING: not packaged (file not found): " + ", ".join(missing_files))
print(f"Saved: {zip_path}")


Saved: PULSE_Brain_Payloads.zip
