### Base config

In [1]:
from __future__ import annotations

from shared.common.config import configure_logger, load_settings, load_config
from shared.common.utils import (
    canonical_json_dumps,
    set_global_seed,
    get_utc_now,
    normalize_location_weights,
    derive_city_mappings,
)

# 1) Logging setup: one named logger shared by every cell of the notebook.
logger = configure_logger(name="dataset_generator", level="INFO")

# 2) Load central configuration. The typed loader is tried first; its
#    `.generation` section is dumped to a plain dict for downstream functions.
try:
    SETTINGS = load_settings("./dataset_config.yaml")   # typed (PipelineConfig)
    GEN = SETTINGS.generation                           # pydantic model (GenerationConfig)
    GEN_CFG = GEN.model_dump()                          # plain dict for downstream functions
except Exception as exc:
    # Fallback: dict-only config (backward compatibility). The reason is logged
    # so that a broken typed configuration is not silently masked.
    logger.warning(
        "Typed settings unavailable (%s: %s); falling back to dict-only config.",
        type(exc).__name__, exc,
    )
    CONFIG = load_config("./dataset_config.yaml")
    GEN_CFG = CONFIG.get("generation", CONFIG) or {}

# 3) Deterministic seeding. A seed that is present but None is treated like a
#    missing one instead of crashing int().
_seed_cfg = GEN_CFG.get("seed")
SEED = 42 if _seed_cfg is None else int(_seed_cfg)
rng = set_global_seed(SEED)

# 4) Reference time: microseconds dropped, ISO 8601 text with a 'Z' suffix
#    replacing the "+00:00" offset.
REFERENCE_TIME = get_utc_now().replace(microsecond=0)
REFERENCE_TIME_ISO = REFERENCE_TIME.isoformat().replace("+00:00", "Z")

# 5) Location weights: mandatory; an absent or empty mapping aborts the run.
location_weights_cfg = GEN_CFG.get("location_weights") or {}
if not location_weights_cfg:
    logger.error("location_weights missing in config.")
    raise ValueError("location_weights not defined")
normalized_location_weights = normalize_location_weights(location_weights_cfg)

# 6) City mappings, with optional per-city overrides taken from the config.
LOCATIONS, URBAN_TYPE_BY_CITY, REGION_BY_CITY = derive_city_mappings(
    GEN_CFG,
    urban_override=GEN_CFG.get("urban_type_by_city"),
    region_override=GEN_CFG.get("region_by_city"),
)

# 7) Other derived settings
CITY_BASE_PRICES = GEN_CFG.get("city_base_prices") or {}
SEASONALITY = GEN_CFG.get("seasonality") or {}
# The defaults apply both when the key is missing and when it is present but
# None/empty (e.g. an unset optional field dumped from the typed config);
# previously the latter case discarded the defaults and failed validation.
ZONE_THRESHOLDS = GEN_CFG.get("zone_thresholds_km") or {"center": 1.5, "semi_center": 5.0}

# Validation: thresholds (best-effort)
if not {"center", "semi_center"} <= set(ZONE_THRESHOLDS):
    raise ValueError("ZONE_THRESHOLDS incomplete: servono 'center' e 'semi_center'")
if float(ZONE_THRESHOLDS["center"]) >= float(ZONE_THRESHOLDS["semi_center"]):
    raise ValueError("'center' threshold must be < 'semi_center'")

# 8) Compact summary of the effective generation settings.
summary = dict(
    seed=SEED,
    reference_time=REFERENCE_TIME_ISO,
    locations_count=len(LOCATIONS),
    rows_to_generate=int(GEN_CFG.get("n_rows", 0)),
    asset_type=str(GEN_CFG.get("asset_type", "property")),
    cities_with_base_prices=len(CITY_BASE_PRICES),
)

# 9) Log a banner followed by the summary serialised with the shared JSON helper.
for banner_line in ("=" * 60, "DATASET GENERATION CONFIGURATION", "=" * 60):
    logger.info(banner_line)
logger.info("Config summary:\n%s", canonical_json_dumps(summary))

[2025-10-05 12:22:24,827] INFO dataset_generator: DATASET GENERATION CONFIGURATION
[2025-10-05 12:22:24,827] INFO dataset_generator: Config summary:
{"asset_type":"property","cities_with_base_prices":15,"locations_count":15,"reference_time":"2025-10-05T10:22:24Z","rows_to_generate":15000,"seed":42}


### Dataset generation and base enrichment

In [2]:
from __future__ import annotations

import json
from pathlib import Path
from datetime import datetime, timezone

import numpy as np
import pandas as pd

from shared.n01_generate_dataset.dataset_builder import generate_dataset_df
from shared.common.sanity_checks import validate_dataset
from shared.common.schema import get_required_fields
from shared.common.utils import (
    canonical_json_dumps,
    optimize_dtypes,
    log_basic_diagnostics,
)

# 0) Preliminary sanity check. Raised explicitly rather than asserted so the
#    guard still runs under `python -O` (which strips assert statements); a
#    n_rows of None is reported with the same message instead of a TypeError.
if not isinstance(GEN_CFG, dict) or int(GEN_CFG.get("n_rows") or 0) <= 0:
    raise ValueError("generation.n_rows mancante o non valido")

# 1) Generate the dataset together with its quality report.
logger.info("Starting dataset generation...")
df, quality_report = generate_dataset_df(
    config=GEN_CFG,
    locations=LOCATIONS,
    urban_map=URBAN_TYPE_BY_CITY,
    region_map=REGION_BY_CITY,
    seasonality=SEASONALITY,
    city_base_prices=CITY_BASE_PRICES,
    rng=rng,
    reference_time=REFERENCE_TIME,
    batch_size=1000,
    show_progress=True,
    validate_each=True,
    error_budget_pct=0.01,
)
logger.info(f"✅ Generated {len(df):,} records")

# Output directory (relative to the current working directory).
Path("outputs").mkdir(parents=True, exist_ok=True)

# Persist the quality report as JSON.
with open("outputs/quality_report.json", "w", encoding="utf-8") as f:
    f.write(canonical_json_dumps(quality_report))
logger.info("✅ Quality report saved to outputs/quality_report.json")

# 3) Dtype optimisation, with a before/after memory comparison.
# Snapshot dtypes and memory (total, and per column without the index) first.
dtypes_before = df.dtypes.copy()
mem_before_cols = df.memory_usage(deep=True, index=False)
mem_before = df.memory_usage(deep=True).sum()

df = optimize_dtypes(df)

dtypes_after = df.dtypes
mem_after_cols = df.memory_usage(deep=True, index=False)
mem_after = df.memory_usage(deep=True).sum()

saved_bytes = mem_before - mem_after
saved_mb = saved_bytes / 1024**2
if mem_before > 0:
    pct_saved = saved_bytes / mem_before * 100
else:
    pct_saved = 0.0  # nothing to divide by

logger.info(
    "✅ Data types optimized: %.2f MB → %.2f MB  (−%.2f MB, %.1f%%)",
    mem_before / 1024**2, mem_after / 1024**2, saved_mb, pct_saved
)

# Optional per-column view: the TOP_N largest positive savings, in MB.
SHOW_TOP_SAVINGS = True
TOP_N = 8
if SHOW_TOP_SAVINGS:
    diff = (mem_before_cols - mem_after_cols).sort_values(ascending=False)
    top = {}
    for _col, _delta in diff.head(TOP_N).items():
        if _delta > 0:
            top[_col] = round(_delta / 1024**2, 3)
    if top:
        logger.info("🏁 Top risparmio per colonna (MB): %s", top)

# Columns whose dtype differs after optimisation (only the first TOP_N are previewed).
changed = [
    name
    for name in df.columns
    if dtypes_before.get(name) is not None and dtypes_before[name] != dtypes_after[name]
]
if changed:
    preview = {name: f"{dtypes_before[name]}→{dtypes_after[name]}" for name in changed[:TOP_N]}
    if len(changed) > TOP_N:
        more = f" (+{len(changed)-TOP_N} altre)"
    else:
        more = ""
    logger.info("🔤 Dtype cambiati (%d): %s%s", len(changed), preview, more)

# 4) Schema validation. A RuntimeError from the validator is converted into a
#    minimal failure report so the rest of the cell can still run.
try:
    validation_report = validate_dataset(
        df,
        asset_type=GEN_CFG.get("asset_type", "property"),
        raise_on_failure=True,
    )
except RuntimeError as err:
    logger.error(f"❌ Validation failed: {err}")
    validation_report = dict(
        overall_passed=False,
        timestamp=datetime.now(timezone.utc).isoformat(),
        n_rows=len(df),
        n_cols=df.shape[1],
        error=str(err),
    )
else:
    logger.info("✅ Dataset validation passed")

# Human-readable recap on stdout.
print("\n" + "=" * 60, "VALIDATION SUMMARY", "=" * 60, sep="\n")
if "schema" in validation_report:
    missing = validation_report["schema"].get("missing", [])
    print(f"⚠️ Missing fields: {missing}" if missing else "✅ All required fields present")
print(f"📊 Rows: {validation_report.get('n_rows', len(df)):,}")
print(f"📊 Cols: {validation_report.get('n_cols', df.shape[1])}")
_verdict = "PASSED" if validation_report.get("overall_passed") else "FAILED"
print(f"✅ Validation: {_verdict}")

# Persist the validation report as JSON next to the other outputs.
report_path = Path("outputs/validation_report.json")
report_path.parent.mkdir(parents=True, exist_ok=True)
# Until a fallback is needed, the "clean" report is the very same object.
clean_report = validation_report

try:
    # Serialise first, so that an encoder failure happens before the file is opened.
    payload = canonical_json_dumps(clean_report)
    with open(report_path, "w", encoding="utf-8") as fp:
        fp.write(payload)
    logger.info(f"✅ Validation report saved to {report_path}")
except (TypeError, ValueError) as e:
    # Encoder rejected a value: the handler continues with a sanitising fallback.
    logger.warning(f"JSON encoder issue ({e}), applying fallback sanitization")

    def _convert_to_serializable(obj):
        """Recursively replace numpy/pandas values with plain Python equivalents.

        NumPy booleans, integers and floats become ``bool``, ``int`` and
        ``float`` respectively (integers are no longer turned into floats),
        arrays become lists, ``pd.Timestamp`` becomes an ISO 8601 string, and
        dicts, lists, tuples and sets are walked recursively (sequences and
        sets come back as lists). Dict keys are left untouched and any other
        object is returned unchanged.
        """
        if isinstance(obj, np.ndarray):
            # Recurse so object-dtype arrays holding e.g. Timestamps are cleaned too.
            return _convert_to_serializable(obj.tolist())
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, pd.Timestamp):
            return obj.isoformat()
        if isinstance(obj, dict):
            return {k: _convert_to_serializable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple, set)):
            return [_convert_to_serializable(item) for item in obj]
        return obj

    # Fallback path: sanitise the report with the helper defined just above,
    # then serialise and write it again to the same location.
    clean_report = _convert_to_serializable(validation_report)
    with open(report_path, "w", encoding="utf-8") as fp:
        fp.write(canonical_json_dumps(clean_report))
    logger.info(f"✅ Validation report (fallback) saved to {report_path}")

# Record the validation outcome on the frame's metadata.
df.attrs.update(
    validation_timestamp=validation_report.get("timestamp"),
    validation_passed=bool(validation_report.get("overall_passed", False)),
)

# 5) Column reorder: required fields first, every other column after them
#    in its current relative order.
required = get_required_fields(GEN_CFG.get("asset_type", "property"))
missing = [field for field in required if field not in df.columns]
if missing:
    raise ValueError(f"Missing required fields: {missing}")

others = [column for column in df.columns if column not in required]
df = df[required + others]
logger.info(
    "✅ Columns reordered: %d required + %d optional",
    len(required),
    len(others),
)

# 6) Optional alignment to the official feature order (only if the spec exists).
ROOT = Path(".").resolve()
NB_DIR = ROOT if ROOT.name == "notebooks" else (ROOT / "notebooks")
FEATURES_PATH = NB_DIR / "modeling" / "property" / "feature_order.json"

if FEATURES_PATH.exists():
    feat_spec = json.loads(FEATURES_PATH.read_text(encoding="utf-8"))
    # Supports both {"feature_order": [...], "dtypes": {...}} and a bare list.
    official = feat_spec.get("feature_order") if isinstance(feat_spec, dict) else feat_spec
    # De-duplicate while keeping order: a repeated name would otherwise select
    # the same column twice and leave duplicate columns in df.
    official = list(dict.fromkeys(official or []))

    # Columns to place right after the required ones.
    official_tail = [c for c in official if c not in required and c in df.columns]

    # Everything else, in stable order, excluding columns already placed.
    taken = set(required) | set(official_tail)
    others_tail = [c for c in df.columns if c not in taken]

    df = df[required + official_tail + others_tail]

    # (Optional) cast dtypes if the spec declares them; `"dtypes": null` is
    # treated as "no casts" instead of failing on `.items()`.
    dtypes_map = (feat_spec.get("dtypes") or {}) if isinstance(feat_spec, dict) else {}
    for col, dtype in dtypes_map.items():
        if col in df.columns:
            try:
                df[col] = df[col].astype(dtype)
            except Exception as e:
                # Best-effort: a failed cast leaves the column as it was.
                logger.warning("Cast dtype failed for  %s→%s: %s", col, dtype, e)

    logger.info(
        "✅ Columns realigned to official feature order: %d required + %d official + %d others",
        len(required), len(official_tail), len(others_tail)
    )
else:
    # The original message ended in a dangling "'official'"; completed here.
    logger.info(
        "ℹ️ feature_order.json not found (%s); skipping 'official' column alignment",
        FEATURES_PATH,
    )

# 7) Quick diagnostics, then a completion recap on stdout.
log_basic_diagnostics(df, logger)

print("\n" + "=" * 60, "DATASET GENERATION COMPLETED", "=" * 60, sep="\n")
print(f"✅ Records: {len(df):,}")
print(f"✅ Features: {df.shape[1]}")
_total_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"✅ Memory: {_total_mb:.2f} MB")
_status = "PASSED" if df.attrs.get("validation_passed") else "FAILED"
print(f"✅ Validation: {_status}")

[2025-10-05 12:22:24,858] INFO dataset_generator: Starting dataset generation...
100%|██████████| 15000/15000 [00:30<00:00, 498.52it/s]
[2025-10-05 12:22:55,088] INFO dataset_generator: ✅ Generated 15,000 records
[2025-10-05 12:22:55,088] INFO dataset_generator: ✅ Quality report saved to outputs/quality_report.json
[2025-10-05 12:22:55,135] INFO dataset_generator: ✅ Data types optimized: 6.70 MB → 4.61 MB  (−2.09 MB, 31.2%)
[2025-10-05 12:22:55,135] INFO dataset_generator: 🏁 Top risparmio per colonna (MB): {'listing_month': 0.072, 'size_m2': 0.072, 'has_elevator': 0.072, 'is_ground_floor': 0.072, 'is_top_floor': 0.072, 'building_floors': 0.072, 'floor': 0.072, 'age_years': 0.072}
[2025-10-05 12:22:55,135] INFO dataset_generator: 🔤 Dtype cambiati (31): {'valuation_k': 'float64→float32', 'price_per_sqm': 'float64→float32', 'listing_month': 'int64→Int16', 'size_m2': 'int64→Int16', 'rooms': 'int64→Int16', 'bathrooms': 'int64→Int16', 'year_built': 'int64→Int16', 'age_years': 'int64→Int16'} 


VALIDATION SUMMARY
✅ All required fields present
📊 Rows: 15,000
📊 Cols: 43
✅ Validation: PASSED

DATASET GENERATION COMPLETED
✅ Records: 15,000
✅ Features: 43
✅ Memory: 4.61 MB
✅ Validation: PASSED


### Profiling & optimizations

In [3]:
from __future__ import annotations

from pathlib import Path
import pandas as pd

from shared.common.performance_utils import DatasetProfiler, DtypeOptimizer
from shared.common.reports import run_sanity_checks
from shared.common.utils import canonical_json_dumps

# 1) Pre-profiling casts & parsing (best effort). A failed cast leaves the
#    column unchanged, but is now logged instead of being silently ignored,
#    matching the dtype-cast handling used when aligning to feature_order.json.
for col, dtype in {
    "price_per_sqm_capped": "float32",
    "listing_quarter": "category",
    "location_premium": "float32",
}.items():
    if col in df.columns:
        try:
            df[col] = df[col].astype(dtype)
        except Exception as exc:
            logger.warning("Pre-profiling cast failed for %s→%s: %s", col, dtype, exc)

# Parse the verification timestamp as UTC; unparseable values are coerced
# (errors="coerce") rather than raising.
if "last_verified_ts" in df.columns:
    df["last_verified_ts"] = pd.to_datetime(df["last_verified_ts"], utc=True, errors="coerce")

# 2) Build the profiler with explicit float-downcast tolerances.
profiler = DatasetProfiler(float_downcast_atol=1e-6, float_downcast_rtol=1e-3)

# 3) Run the full profiling pass; `groupby_observed` comes from the config
#    and defaults to True.
print("🔍 Profiling dataset performance...")
_observed = bool(GEN_CFG.get("groupby_observed", True))
profile_results = profiler.profile(df, groupby_observed=_observed)

# 4) Memory report
mem = profile_results.get("memory", {}) or {}
print("\n=== MEMORY PROFILE ===")
print(f"Total memory: {mem.get('total_mb', 0.0):.2f} MB")
print(f"Memory per row: {mem.get('per_row_kb', 0.0):.3f} KB")
print("\nMemory by dtype:")
_by_dtype = mem.get("by_dtype", {}) or {}
for _kind, _stats in _by_dtype.items():
    _mb = _stats.get("mb", 0.0)
    _pct = _stats.get("pct", 0.0)
    _ncols = _stats.get("n_columns", 0)
    print(f"  {_kind}: {_mb:.2f} MB ({_pct:.1f}%) - {_ncols} cols")

# 5) Performance benchmarks (each section is printed only if it was timed)
perf = profile_results.get("performance", {}) or {}
print("\n=== PERFORMANCE BENCHMARKS ===")
if "groupby" in perf and "time_ms" in perf["groupby"]:
    gb = perf["groupby"]
    cols = gb.get("columns") or []
    _label = ", ".join(cols) if cols else "?"
    print(f"GroupBy ({_label}): {gb['time_ms']:.2f} ms")
    _rate = gb.get("rows_per_sec")
    if _rate is not None:
        print(f"  → {_rate:,} rows/sec")
if "sort" in perf and "time_ms" in perf["sort"]:
    srt = perf["sort"]
    _sort_col = srt.get("column", "?")
    print(f"Sort by {_sort_col}: {srt['time_ms']:.2f} ms")

# 6) Dtype suggestions (at most the first ten are printed)
dtype_sugg = profile_results.get("dtype_optimization") or {}
print("\n=== DTYPE OPTIMIZATION SUGGESTIONS ===")
if not dtype_sugg:
    print("No optimizations suggested")
else:
    for _name, _hint in list(dtype_sugg.items())[:10]:
        _current = _hint.get("current", "?")
        _target = _hint.get("target", "?")
        _reason = _hint.get("reason", "")
        print(f"  {_name}: {_current} → {_target} ({_reason})")
        if "memory_reduction_pct" in _hint:
            print(f"    → Mem saving: {_hint['memory_reduction_pct']}%")

# 7) Index suggestions: a check mark for "excellent"/"good", a cross otherwise
print("\n=== INDEX SUGGESTIONS ===")
for _entry in profile_results.get("index_suggestions", []) or []:
    if _entry.get("utility") in {"excellent", "good"}:
        mark = "✅"
    else:
        mark = "❌"
    print(f"  {mark} {_entry.get('column','?')}: {_entry.get('utility','?')} ({_entry.get('reason','')})")

# 8) Apply the suggested optimisations automatically (optional)
AUTO_APPLY_OPTIMIZATIONS = True
if AUTO_APPLY_OPTIMIZATIONS and dtype_sugg:
    print("\n🔧 Applying dtype optimizations...")
    optimizer = DtypeOptimizer()
    try:
        df, opt_report = optimizer.apply(df, dtype_sugg, inplace=False)
    except TypeError:
        # Fallback for older versions (positional-only signature)
        df, opt_report = optimizer.apply(df, dtype_sugg, False)

    summary = opt_report.get("summary", {}) or {}
    print(
        "Applied: {}, Skipped: {}, Failed: {}".format(
            summary.get("applied", 0),
            summary.get("skipped", 0),
            summary.get("failed", 0),
        )
    )

    # Memory delta against the total reported by the pre-optimisation profile
    total_before = float(mem.get("total_mb", 0.0) or 0.0)
    new_mem_mb = df.memory_usage(deep=True).sum() / 1024**2
    saved = total_before - new_mem_mb
    if saved > 0:
        if total_before > 0:
            base = total_before
        else:
            base = 1e-9  # guard against division by zero
        print(f"✅ Memory saved: {saved:.2f} MB ({(saved / base) * 100:.1f}%)")

# 9) Save the profiling report to <log_dir>/profiling_report.json.
SAVE_PROFILING_REPORT = True
if SAVE_PROFILING_REPORT:
    # `paths` (or `paths.log_dir`) may be present but None, e.g. an unset
    # optional field in the dumped config; fall back to "./logs" in that case
    # instead of failing with AttributeError/TypeError.
    log_dir = (GEN_CFG.get("paths") or {}).get("log_dir") or "./logs"
    report_path = Path(log_dir) / "profiling_report.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(canonical_json_dumps(profile_results))
    logger.info("Profiling report saved to: %s", report_path)

# =========================
# SANITY BENCHMARKS & DRIFT
# =========================
# The helper returns the report together with a (possibly updated) frame.
sanity_report, df = run_sanity_checks(df, GEN_CFG)

logs_dir = Path("./outputs")
logs_dir.mkdir(parents=True, exist_ok=True)
sanity_path = logs_dir / "sanity_report.json"

# The report is written before the pass/fail decision so it exists either way.
with open(sanity_path, "w", encoding="utf-8") as fp:
    fp.write(canonical_json_dumps(sanity_report))

if sanity_report.get("all_passed", True):
    logger.info("✅ Sanity checks passed; report saved in %s", sanity_path)
else:
    raise RuntimeError(f"Sanity checks failed – see details in {sanity_path}")

[2025-10-05 12:22:55,243] INFO dataset_generator: Profiling report saved to: C:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\logs\profiling_report.json


🔍 Profiling dataset performance...

=== MEMORY PROFILE ===
Total memory: 3.62 MB
Memory per row: 0.247 KB

Memory by dtype:
  object: 1.92 MB (52.9%) - 2 cols
  category: 0.13 MB (3.7%) - 9 cols
  float32: 0.52 MB (14.2%) - 9 cols
  datetime64[ns, UTC]: 0.11 MB (3.2%) - 1 cols
  Int16: 0.94 MB (26.0%) - 22 cols

=== PERFORMANCE BENCHMARKS ===
GroupBy (location, energy_class): 1.16 ms
  → 12,969,047 rows/sec
Sort by valuation_k: 2.05 ms

=== DTYPE OPTIMIZATION SUGGESTIONS ===
  asset_type: object → category (low cardinality (0.0%))
  listing_month: Int16 → uint8 (values in [10, 10])
    → Mem saving: 50%
  size_m2: Int16 → uint8 (values in [40, 199])
    → Mem saving: 50%
  rooms: Int16 → uint8 (values in [2, 7])
    → Mem saving: 50%
  bathrooms: Int16 → uint8 (values in [1, 3])
    → Mem saving: 50%
  year_built: Int16 → uint16 (values in [1950, 2023])
    → Mem saving: 0%
  age_years: Int16 → uint8 (values in [2, 75])
    → Mem saving: 50%
  floor: Int16 → uint8 (values in [0, 5])
  

[2025-10-05 12:22:55,405] INFO dataset_generator: ✅ Sanity checks passed; report saved in outputs\sanity_report.json


### Export

In [None]:
from __future__ import annotations

import re
import hashlib
from datetime import datetime, timezone
from pathlib import Path

from shared.n01_generate_dataset.exporter import export_dataset
from shared.common.utils import canonical_json_dumps

# ---- Path helpers (non-breaking): relative paths stay under notebooks/ ----
# NB_DIR is the current directory when it is itself named "notebooks",
# otherwise its "notebooks" child.
ROOT = Path(".").resolve()
if ROOT.name == "notebooks":
    NB_DIR = ROOT
else:
    NB_DIR = ROOT / "notebooks"

def _resolve_under_nbdir(p: str | Path, default: str = "outputs") -> Path:
    p = Path(str(p or default)).expanduser()
    return p if p.is_absolute() else (NB_DIR / p)

# === Desired output locations ===
# `paths` may be present but None (unset optional section); treat it as empty
# instead of failing with AttributeError on `.get`.
_paths_cfg = GEN_CFG.get("paths") or {}
OUT_DIR = _resolve_under_nbdir(_paths_cfg.get("out_dir", "outputs"))
OUT_DIR.mkdir(parents=True, exist_ok=True)

FILE_FORMAT = str(GEN_CFG.get("export_format") or "csv").lower()
# Explicit raise rather than assert: asserts are stripped under `python -O`.
if FILE_FORMAT not in {"csv", "parquet"}:
    raise ValueError(f"Unsupported export_format: {FILE_FORMAT}")
COMPRESSION = GEN_CFG.get("compression", None if FILE_FORMAT == "csv" else "snappy")
NO_OVERWRITE = bool(GEN_CFG.get("no_overwrite", False))

# ---- sanitized filename base ----
# Runs of characters outside [A-Za-z0-9._-] collapse to "_"; leading/trailing
# ".", "_" and "-" are stripped, and an empty result falls back to the default.
_raw_base = str(GEN_CFG.get("filename_prefix") or GEN_CFG.get("name") or "dataset_generated")
BASE_NAME = re.sub(r"[^A-Za-z0-9._-]+", "_", _raw_base).strip("._-") or "dataset_generated"
ext = FILE_FORMAT  # validated above: either "csv" or "parquet"
output_path = OUT_DIR / f"{BASE_NAME}.{ext}"

snapshot_dir = OUT_DIR / "snapshots"
log_dir = OUT_DIR / "logs"
snapshot_dir.mkdir(parents=True, exist_ok=True)
log_dir.mkdir(parents=True, exist_ok=True)

# Build the config fed to the exporter (non-breaking): the generation config
# with its `paths` section extended by the resolved locations.
EXPORT_CFG = {
    **GEN_CFG,
    "paths": {
        **_paths_cfg,
        "output_path": str(output_path),
        "snapshot_dir": str(snapshot_dir),
        "log_dir": str(log_dir),
    },
}

# === Export dataset ===
# Bundle the reports produced by earlier cells: quality_report and the
# validation clean_report (cell 02), and sanity_report (cell 03).
# Each lookup falls back to an empty dict when the name was never defined.
# NOTE: locals() is called at the top level of the cell on purpose; moving
# these lookups into a function or comprehension would change which
# namespace is inspected.
combined_report = {
    "quality_report": locals().get("quality_report", {}),
    "validation_report": locals().get("clean_report", {}),
    "sanity_report": locals().get("sanity_report", {}),
}

# Hand the frame, the export config and the combined report to the exporter;
# the returned manifest is enriched and summarised further down.
manifest = export_dataset(
    df=df,
    config=EXPORT_CFG,
    report=combined_report,
    logger=logger,
    format=FILE_FORMAT,
    compression=COMPRESSION,
    index=False,
    no_overwrite=NO_OVERWRITE,
)

# ---- Enrich in-memory manifest view with a few convenient extras (no overwrite if already present) ----
def _sha256(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *p*.

    The file is read in binary mode in 64 KiB chunks, so it is never loaded
    into memory as a whole.
    """
    digest = hashlib.sha256()
    with p.open("rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

# Resolve the dataset path from manifest["paths"]["dataset"]; any failure
# while reading the manifest leaves it unresolved.
ds_path = None
try:
    ds_path_str = (manifest or {}).get("paths", {}).get("dataset")
    ds_path = Path(ds_path_str) if ds_path_str else None
except Exception:
    ds_path = None

if not (ds_path and ds_path.exists()):
    # Fallback: first existing candidate under OUT_DIR, otherwise keep the
    # current value.
    _candidates = (
        output_path,
        OUT_DIR / f"{BASE_NAME}.csv",
        OUT_DIR / f"{BASE_NAME}.parquet",
    )
    ds_path = next((_c for _c in _candidates if _c.exists()), ds_path)

# Convenience extras for the in-memory manifest. Computing them is
# best-effort: a failure is logged (previously it was silently swallowed)
# and whatever was collected so far is kept.
extras = {}
try:
    if ds_path and ds_path.exists():
        extras["sha256"] = _sha256(ds_path)
        extras["rows"] = int(len(df))
        extras["cols"] = list(map(str, df.columns))
    # feature_order signature (optional)
    feat_json = NB_DIR / "modeling" / "property" / "feature_order.json"
    if feat_json.exists():
        extras["feature_order_sha256"] = _sha256(feat_json)
except Exception as exc:
    logger.warning("Could not compute manifest extras: %s", exc)

# add seed + utc timestamp
extras["seed"] = int(SEED)
extras["ts_utc"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# setdefault: values already provided by the exporter are never overwritten.
if isinstance(manifest, dict):
    for k, v in extras.items():
        manifest.setdefault(k, v)

# Persist an export summary alongside logs for convenience
summary_path = log_dir / "export_summary.json"
summary_path.write_text(canonical_json_dumps(manifest), encoding="utf-8")
logger.info("Export summary saved to %s", summary_path)

# Artifact logging (best-effort; a failure is reported instead of hidden)
try:
    size_mb = (ds_path.stat().st_size / 1024**2) if (ds_path and ds_path.exists()) else 0.0
    logger.info(
        "Artifact written: %s (%.2f MB) – rows=%s, cols=%s",
        ds_path, size_mb, f"{len(df):,}", df.shape[1]
    )
except Exception as exc:
    logger.warning("Artifact logging failed: %s", exc)

# Optional: quick descriptive snapshot (first 20 rows of the transposed
# describe() table). Still best-effort, but a failure is now logged.
try:
    (df.describe(include="all")
       .T.head(20)
       .to_csv(OUT_DIR / "describe_snapshot.csv", encoding="utf-8"))
except Exception as exc:
    logger.warning("Could not write describe_snapshot.csv: %s", exc)

# ------ Location drift check (from sanity_report built in Cell 03) ------
# Walk sanity_report -> sanity_benchmarks -> location_drift, tolerating
# missing or empty levels.
_sanity = combined_report.get("sanity_report", {}) or {}
drift_info = _sanity.get("sanity_benchmarks", {}) or {}
if isinstance(drift_info, dict):
    drift_info = drift_info.get("location_drift", {})
else:
    drift_info = {}

tolerance = float(GEN_CFG.get("drift_tolerance", 0.15))

# Keep a copy of the drift section for later analysis.
ANALYSIS_DIR = OUT_DIR / "analysis"
ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)
_drift_file = ANALYSIS_DIR / "location_drift_generation.json"
_drift_file.write_text(canonical_json_dumps(drift_info), encoding="utf-8")

# Collect locations whose absolute difference exceeds the tolerance;
# entries that cannot be read or converted are skipped.
violating = []
if isinstance(drift_info, dict) and "by_location" in drift_info:
    _per_location = drift_info.get("by_location") or {}
    for _place, _entry in _per_location.items():
        try:
            diff = float(_entry.get("difference", 0.0))
            if abs(diff) > tolerance:
                violating.append(_place)
        except Exception:
            continue

if violating:
    raise ValueError(f"Excessive location drift for: {violating} (tolerance ±{tolerance})")

# ------ Manifest recap in logs (new schema-aware) ------
# Only a fixed set of top-level keys is echoed; absent keys show up as null.
wanted = [
    "generated_at",
    "manifest_hash",
    "sha256",
    "feature_order_sha256",
    "rows",
    "seed",
    "ts_utc",
]
manifest_summary = {}
for _key in wanted:
    manifest_summary[_key] = (manifest or {}).get(_key)
paths_summary = (manifest or {}).get("paths", {})

logger.info("Export completed with manifest:")
_recap = {"summary": manifest_summary, "paths": paths_summary}
logger.info(canonical_json_dumps(_recap))

[2025-10-05 12:22:55,717] INFO dataset_generator: ✅ Saved dataset to C:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\dataset_generated.csv
[2025-10-05 12:22:55,717] INFO dataset_generator: ✅ Saved quality report JSON to C:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\logs\quality_report.json
[2025-10-05 12:22:55,726] INFO dataset_generator: ✅ Saved top 30 outliers to C:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\logs\top_outliers.csv
[2025-10-05 12:22:55,879] INFO dataset_generator: ✅ Saved manifest to C:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\snapshots\manifest_20251005T102255Z.json (hash=79e88bbabaae852e3d560715fa736053000e1c17945cac2403e0271849c938d4)
[2025-10-05 12:22:55,884] INFO dataset_generator: Export summary saved to C:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\logs\export_summary.json
[2025-10-05 12:22:55,884] INFO dataset_gen