In [1]:
from pathlib import Path
import yaml

from src.crispdm.core.logging_utils_core import init_logging, build_log_file, get_logger
from src.crispdm.config.load_loader_config import load_and_resolve, load_yaml, find_unresolved_placeholders
from src.crispdm.config.schema_dto_config import ProjectConfig

# ---------- Paths (IMPORTANT) ----------
PROJECT_ROOT = Path.cwd().parent           # notebook inside /notebooks
OUT_ROOT = PROJECT_ROOT / "out"
CONFIG_ROOT = PROJECT_ROOT / "config"

preset_path = CONFIG_ROOT / "pipelines" / "classification_pipeline_config.yml"
dataset_cfg_path = CONFIG_ROOT / "datasets" / "dataset_config.yml"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("preset_path exists?", preset_path.exists(), preset_path)
print("dataset_cfg_path exists?", dataset_cfg_path.exists(), dataset_cfg_path)
print("OUT_ROOT:", OUT_ROOT)

# ---------- Logging ----------
log_file = build_log_file(output_root=OUT_ROOT, run_name="config_smoke_test")
init_logging(log_file, level="DEBUG")
log = get_logger(__name__)
log.info("=== CONFIG SMOKE TEST START ===")

# ---------- 1) Load dataset_config.yml (if present) ----------
dataset_vars = {}
if dataset_cfg_path.exists():
    ds_cfg = load_yaml(dataset_cfg_path)
    dataset_id = "microsoft_security_incident"  # change if your id is different
    ds = ds_cfg["datasets"][dataset_id]

    dataset_vars = {
        "dataset_path": ds["paths"]["train"],
        "target_col": ds.get("columns_hint", {}).get("target_col"),
        "time_col": ds.get("columns_hint", {}).get("time_col"),
        "id_cols": ds.get("columns_hint", {}).get("id_cols"),
    }
    log.info("Loaded dataset_config.yml dataset_id=%s train=%s", dataset_id, dataset_vars["dataset_path"])
else:
    log.warning("dataset_config.yml not found. Using notebook vars only.")

# ---------- 2) Notebook overrides (always win) ----------
runtime_vars = dict(dataset_vars)
runtime_vars.update({
    "dataset_path": str(PROJECT_ROOT / "data/raw/train/GUIDE_Train.csv"),  # override example
    "target_col": None,
    "time_col": None,
    "id_cols": None,
})

runtime_vars["output_root"] = str(OUT_ROOT)
# ---------- 3) Load + resolve pipeline YAML ----------
loaded = load_and_resolve(preset_path, runtime_vars=runtime_vars)

log.info("YAML loaded from: %s", loaded.source_path)
log.info("Top keys (raw): %s", list(loaded.raw.keys()))
log.info("Top keys (resolved): %s", list(loaded.resolved.keys()))
log.info("Merged variables keys: %s", sorted(list(loaded.variables.keys())))

has_unresolved, n = find_unresolved_placeholders(loaded.resolved)
log.info("Unresolved placeholders? %s count=%d", has_unresolved, n)

# ---------- 4) Build typed config (DTO) ----------
cfg = ProjectConfig.from_dict(loaded.resolved)
log.info("DTO ok: pipeline=%s task=%s output_root=%s",
         cfg.pipeline.name, cfg.pipeline.task.value, cfg.runtime.output_root)

# ---------- 5) Save audit snapshot (config_used.yml) ----------
#audit_dir = OUT_ROOT / "audit"
#audit_dir.mkdir(parents=True, exist_ok=True)
#config_used_path = audit_dir / "config_used.yml"
#config_used_path.write_text(yaml.safe_dump(loaded.resolved, sort_keys=False), encoding="utf-8")
#log.info("Saved audit config_used.yml at: %s", config_used_path)
#print("Saved:", config_used_path)


from crispdm.reporting.audit_service_reporting import save_config_used

# IMPORTANT: prefer absolute OUT_ROOT (like you already fixed)
save_config_used(
    loaded.resolved,
    task=cfg.pipeline.task,
    output_root=OUT_ROOT
)
log.info("Saved audit config_used.yml at: %s", save_config_used)
print("Saved:", save_config_used)
log.info("=== CONFIG SMOKE TEST DONE ===")


[07:39:43] [DEBUG] crispdm - Logging initialized. log_file=K:\00_Code\DataScience\Project_DS_Microsoft_Security_Incident_Prediction\out\logs\config_smoke_test_20260129_073943.log level=DEBUG
[07:39:43] [INFO] crispdm.__main__ - === CONFIG SMOKE TEST START ===
[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loader_config - load_yaml: path=K:\00_Code\DataScience\Project_DS_Microsoft_Security_Incident_Prediction\config\datasets\dataset_config.yml
[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loader_config - load_yaml: loaded keys=['version', 'datasets']
[07:39:43] [INFO] crispdm.__main__ - Loaded dataset_config.yml dataset_id=microsoft_security_incident train=data/raw/train/GUIDE_Train.csv
[07:39:43] [INFO] crispdm.src.crispdm.config.load_loader_config - load_and_resolve: start path=K:\00_Code\DataScience\Project_DS_Microsoft_Security_Incident_Prediction\config\pipelines\classification_pipeline_config.yml
[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loader_config - load_and_

PROJECT_ROOT: K:\00_Code\DataScience\Project_DS_Microsoft_Security_Incident_Prediction
preset_path exists? True K:\00_Code\DataScience\Project_DS_Microsoft_Security_Incident_Prediction\config\pipelines\classification_pipeline_config.yml
dataset_cfg_path exists? True K:\00_Code\DataScience\Project_DS_Microsoft_Security_Incident_Prediction\config\datasets\dataset_config.yml
OUT_ROOT: K:\00_Code\DataScience\Project_DS_Microsoft_Security_Incident_Prediction\out


[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loader_config - load_yaml: loaded keys=['version', 'pipeline', 'runtime', 'stages']
[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loader_config - merge_variables: yaml_vars=['dataset_path', 'id_cols', 'target_col', 'time_col'] runtime_vars=['dataset_path', 'id_cols', 'output_root', 'target_col', 'time_col']
[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loader_config - _resolve_string: whole-token key=dataset_path -> 'K:\\00_Code\\DataScience\\Project_DS_Microsoft_Security_Incident_Prediction\\data\\raw\\train\\GUIDE_Train.csv'
[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loader_config - _resolve_string: whole-token key=target_col -> None
[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loader_config - _resolve_string: whole-token key=time_col -> None
[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loader_config - _resolve_string: whole-token key=id_cols -> None
[07:39:43] [DEBUG] crispdm.src.crispdm.config.load_loa

Unresolved placeholders? False count: 0
Saved: <function save_config_used at 0x0000027087F69BD0>
