### Imports

In [None]:
# 00. Imports
from __future__ import annotations

import json
from pathlib import Path

import numpy as np                  # type: ignore
import pandas as pd                 # type: ignore
import matplotlib.pyplot as plt     # type: ignore

from notebooks.shared.n02_explore_dataset.eda_core import (
    DescriptiveAnalyzer, OutlierDetector, TemporalAnalyzer,
    StatisticalTester, FeatureImportanceAnalyzer, ensure_temporal_columns,
    plot_correlation_heatmap,
)
from notebooks.shared.n02_explore_dataset.eda_reports import InsightsAnalyzer, EDAReportRunner
from notebooks.shared.common.utils import NumpyJSONEncoder, log_basic_diagnostics
from notebooks.shared.common.constants import (
    VALUATION_K, ENERGY_CLASS, CONDITION_SCORE, RISK_SCORE,
    LUXURY_SCORE, ENV_SCORE, SIZE_M2, LAG_HOURS, LOCATION
)

# Notebook-wide matplotlib defaults: figure size (inches) and resolution.
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "figure.dpi": 110,
})

### Setup

In [None]:
# 01: Setup
# Output folder for tables/reports produced by this notebook (created on demand).
ANALYSIS_DIR = Path("outputs/analysis")
ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)

# Sub-folder that receives every saved figure.
FIG_DIR = ANALYSIS_DIR / "figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Figure-saving helper
def _savefig(name: str) -> None:
    """Save the current matplotlib figure as ``FIG_DIR / name`` and print the path.

    Args:
        name: File name (including extension) to create inside ``FIG_DIR``.
    """
    target = FIG_DIR / name
    plt.savefig(target, bbox_inches="tight")
    print(f"📈 Figure saved: {target}")

# Load the latest nb01 manifest: the last entry after sorting the matching paths.
snapshots_dir = Path("outputs/snapshots")
manifests = sorted(snapshots_dir.glob("manifest_*.json"))
# Raise explicitly instead of using `assert`: assertions are removed when Python
# runs with -O, which would turn a missing manifest into an obscure IndexError.
if not manifests:
    raise FileNotFoundError("Nessun manifest trovato in outputs/snapshots. Esegui nb01 export prima.")
manifest01_path = manifests[-1]
with open(manifest01_path, "r", encoding="utf-8") as f:
    manifest01 = json.load(f)

# The manifest must point to an existing dataset file.
dataset_path = Path(manifest01["paths"]["dataset"])
if not dataset_path.exists():
    raise FileNotFoundError(f"Dataset non trovato: {dataset_path}")

# Read the dataset: parquet when the suffix says so, CSV otherwise.
if dataset_path.suffix.lower() == ".parquet":
    df = pd.read_parquet(dataset_path)
else:
    df = pd.read_csv(dataset_path)

# Consistent temporal columns (best-effort) + diagnostics
df = ensure_temporal_columns(df)
log_basic_diagnostics(df)

print(f"Loaded dataset: {dataset_path}  → rows={len(df):,}, cols={df.shape[1]}")

### Report Loading & Distribution Analysis — 2.1 Load nb01 reports

In [None]:
# 02.1: Load the nb01 reports (robust: absolute path first, then fallbacks)
from pathlib import Path

quality_path = Path(manifest01["paths"].get("quality_report_json", ""))
sanity_path  = Path("outputs/sanity_report.json")
drift_path   = Path("outputs/location_drift.json")

# Candidate locations for profiling_report.json, probed in this order.
_manifest_log_dir = manifest01.get("paths", {}).get("log_dir")
profiling_candidates = [
    # NOTE(review): machine-specific absolute path, kept so existing setups keep
    # working; it simply does not exist on other machines and is skipped.
    Path(r"C:\Users\Utente\Desktop\Projects\ai_oracle_rwa\logs\profiling_report.json"),
]
if _manifest_log_dir:
    # Only meaningful when the manifest actually declares a log directory.
    profiling_candidates.append(Path(_manifest_log_dir) / "profiling_report.json")
profiling_candidates += [
    Path("logs/profiling_report.json"),
    Path("outputs/logs/profiling_report.json"),
    Path("../logs/profiling_report.json").resolve(),
    Path("../../logs/profiling_report.json").resolve(),
]

# First candidate that exists on disk, if any.
profiling_path = next((p for p in profiling_candidates if p.exists()), None)
if profiling_path is None:
    # Last resort: search the current working tree. rglob() is lazy, so taking
    # only the first hit avoids walking (and listing) the whole tree.
    profiling_path = next(Path.cwd().resolve().rglob("profiling_report.json"), None)

def _load_json(p: Path) -> dict:
    if not p or not p.exists(): return {}
    try:
        with open(p, "r", encoding="utf-8") as f: return json.load(f)
    except Exception:
        return {}

# Map each nb01 artefact name to its source path, then load them in that order.
# A missing profiling path (None) yields an empty report without a load attempt.
_report_sources = {
    "quality": quality_path,
    "sanity": sanity_path,
    "profiling": profiling_path,
    "location_drift": drift_path,
}
reports01 = {
    key: (_load_json(src) if src else {})
    for key, src in _report_sources.items()
}

# Quick summary: which artefacts are non-empty, and where they were searched for.
_loaded_names = [key for key, payload in reports01.items() if payload]
print("Artefatti caricati:", _loaded_names)
print("CWD:", Path.cwd())
print("Profiling path:", profiling_path or "not found")

### Report Loading & Distribution Analysis — 2.2 Distribution by location

In [None]:
# 02.2: Location distribution (top-level API) + export
from notebooks.shared.common.reports import DistributionAnalyzer

# Expected profile from the nb01 sanity report; {} when absent or null.
_sanity_profile = reports01.get("sanity", {}).get("expected_profile", {}) or {}

dist = DistributionAnalyzer(df)
loc_analysis = dist.analyze_location(
    target_weights=_sanity_profile.get("location_weights"),
    tolerance=_sanity_profile.get("location_distribution_tolerance", 0.05),
)

loc_counts = loc_analysis.get("counts", {}) or {}
loc_pcts = loc_analysis.get("percentages", {}) or {}

# One row per location; locations missing from either mapping get 0.
loc_df = pd.DataFrame(
    {
        "count": pd.Series(loc_counts, dtype="Int64"),
        "pct": pd.Series(loc_pcts, dtype="float"),
    }
)
loc_df = loc_df.fillna(0)
loc_df = loc_df.sort_values("count", ascending=False)

display(loc_df.head(20))

# Persist the table in both CSV and Parquet form.
loc_csv = ANALYSIS_DIR / "location_distribution.csv"
loc_parq = ANALYSIS_DIR / "location_distribution.parquet"
loc_df.to_csv(loc_csv, encoding="utf-8")
loc_df.to_parquet(loc_parq)
print(f"Saved: {loc_csv}, {loc_parq}")

### Report Loading & Distribution Analysis — 2.3 Drift summary

In [None]:
# 02.3: Drift summary with robust fallbacks
expected_profile = reports01.get("sanity", {}).get("expected_profile", {}) or {}
expected = expected_profile.get("location_weights", {}) or {}
tolerance = expected_profile.get("location_distribution_tolerance", 0.05)

# Fallback A: reuse the expected values stored in nb01's drift report (if present)
if not expected and reports01.get("location_drift"):
    diffs_nb01 = reports01["location_drift"].get("differences", {}) or {}
    expected = {
        name: float(entry.get("expected", 0.0) or 0.0)
        for name, entry in diffs_nb01.items()
    }

# Stay empty when there is no baseline (Fallback B below).
drifted, differences = [], {}

if not expected:
    # Fallback B: still no baseline, so the drift check is skipped
    print("ℹ️ Nessun expected_profile disponibile → salto il drift check (baseline mancante).")
else:
    obs_pct = loc_analysis.get("percentages", {}) or {}
    all_locs = sorted(set(expected.keys()) | set(obs_pct.keys()))
    for loc in all_locs:
        exp = float(expected.get(loc, 0.0) or 0.0)
        obs = float(obs_pct.get(loc, 0.0) or 0.0)
        diff = obs - exp
        differences[loc] = {"expected": exp, "observed": obs, "difference": diff}
    # A location drifts when |observed - expected| exceeds the tolerance.
    drifted = [
        name for name, row in differences.items()
        if abs(row["difference"]) > tolerance
    ]

drift_report = {
    "tolerance": tolerance,
    "drifted_locations": drifted,
    "differences": differences,
    "nb01_drift": reports01.get("location_drift", {}),
    "summary": loc_analysis.get("summary", {}),
}
if not expected:
    drift_report["note"] = "No baseline → drift check skipped"
elif drifted:
    print("⚠️ Drift oltre soglia per:", drifted)

# Export the drift report as JSON.
drift_json = ANALYSIS_DIR / "location_drift_eda.json"
_drift_payload = json.dumps(drift_report, cls=NumpyJSONEncoder, indent=2, ensure_ascii=False)
drift_json.write_text(_drift_payload, encoding="utf-8")
print(f"Saved: {drift_json}")

### Report Loading & Distribution Analysis — 2.4 Barplot top locations

In [None]:
# 02.4: Bar plot of the top locations (robust, no helper functions defined)
from pathlib import Path
fig_dir = Path("outputs/analysis/figures")
fig_dir.mkdir(parents=True, exist_ok=True)

# Prefer the table from 02.2 when that cell ran and produced usable counts.
_loc_table_ready = (
    "loc_df" in globals()
    and isinstance(loc_df, pd.DataFrame)
    and not loc_df.empty
    and "count" in loc_df
)

if _loc_table_ready:
    top = loc_df["count"].sort_values(ascending=False).head(12)
elif LOCATION in df.columns:
    # Otherwise count locations straight from the dataset.
    top = df[LOCATION].value_counts(dropna=False).head(12)
else:
    top = pd.Series(dtype="int64")

if not top.empty:
    ax = top.plot(kind="bar")
    ax.set_title("Top locations by count")
    ax.set_ylabel("Count")
    ax.set_xlabel("Location")
    plt.xticks(rotation=30, ha="right")
    plt.tight_layout()
    plt.savefig(fig_dir / "top_locations_count.png", bbox_inches="tight")
    plt.show()
else:
    print("⚠️ Nessuna location disponibile per il plot.")

### Report Loading & Distribution Analysis — 2.5 Valuation histogram

In [None]:
# 02.5: valuation_k histogram (quick extra)
if VALUATION_K in df:
    df[VALUATION_K].plot(kind="hist", bins=40)
    plt.title("Valuation (k€) distribution")
    plt.xlabel("valuation_k")
    plt.ylabel("freq")
    _savefig("valuation_hist.png")
    plt.show()

### Descriptive Statistics & Distributions — relationship plots

In [None]:
# 03.1: Relationship plots (scatter, regplot, boxplot, score heatmap)
desc = DescriptiveAnalyzer()
fig = desc.create_relationship_plots(df, figsize=(12, 10))
# Save first, then render: _savefig writes the current figure.
_savefig("relationship_plots.png")
plt.show()

### Descriptive Statistics & Distributions — heatmap score


In [None]:
# 03.2: Correlation heatmap over the score columns
_score_candidates = (CONDITION_SCORE, RISK_SCORE, LUXURY_SCORE, ENV_SCORE)
score_cols = [col for col in _score_candidates if col in df.columns]

# The heatmap is only drawn when at least two score columns are present.
if len(score_cols) >= 2:
    fig, ax = plt.subplots(figsize=(6, 5))
    plot_correlation_heatmap(df, score_cols, ax=ax)
    _savefig("score_correlation_heatmap.png")
    plt.show()

### Condition and Risk

In [None]:
# 04: Condition vs Energy class – dedicated boxplot
if CONDITION_SCORE in df and ENERGY_CLASS in df:
    import seaborn as sns  # also used internally by eda_core

    plt.figure(figsize=(8, 5))
    # Outlier markers are hidden (showfliers=False).
    sns.boxplot(data=df, x=ENERGY_CLASS, y=CONDITION_SCORE, showfliers=False)
    plt.title("Condition score by Energy class")
    plt.xlabel("Energy class")
    plt.ylabel("Condition score")
    _savefig("condition_by_energy_boxplot.png")
    plt.show()

### Relations

In [None]:
# 05.1: Pearson correlation with the target
num_df = df.select_dtypes(include=[np.number])
if VALUATION_K in num_df:
    # Target column of the correlation matrix, without the target's own entry.
    _corr_matrix = num_df.corr(numeric_only=True)
    corr = _corr_matrix[VALUATION_K].drop(labels=[VALUATION_K], errors="ignore")

    # Rankings kept available for interactive inspection.
    top_pos = corr.sort_values(ascending=False).head(8)
    top_neg = corr.sort_values(ascending=True).head(8)
    top_abs = corr.abs().sort_values(ascending=False).head(12)

    corr_df = pd.DataFrame({"corr": corr})
    # Show the 12 strongest correlations by absolute value.
    display(corr_df.loc[top_abs.index])

    corr_csv = ANALYSIS_DIR / "target_correlations.csv"
    corr_parq = ANALYSIS_DIR / "target_correlations.parquet"
    corr_df.to_csv(corr_csv, encoding="utf-8")
    corr_df.to_parquet(corr_parq)
    print(f"Saved: {corr_csv}, {corr_parq}")

### Advanced statistical analysis

In [None]:
# 06.1: Statistical tests (normality, χ², distribution stats) + summary
tester = StatisticalTester()
stats_results = tester.run_comprehensive_tests(df)

# One-row table built from the "summary" entry of the results.
summary_df = pd.DataFrame([stats_results["summary"]])
display(summary_df)

# Export the full results as JSON.
stats_json = ANALYSIS_DIR / "stat_tests_results.json"
_stats_payload = json.dumps(stats_results, cls=NumpyJSONEncoder, indent=2, ensure_ascii=False)
stats_json.write_text(_stats_payload, encoding="utf-8")
print(f"Saved: {stats_json}")

### Temporal analysis

In [None]:
# 07: Temporal analysis (refactor + per-location breakdown)
from pathlib import Path

# 07.1: Temporal analysis
temp = TemporalAnalyzer()
df_temp, temp_report = temp.analyze(df, target=VALUATION_K)
print(temp_report)

# 07.2: Temporal plots + figure export
fig = temp.plot(df_temp, target=VALUATION_K)

fig_dir = Path("outputs/analysis/figures")
fig_dir.mkdir(parents=True, exist_ok=True)
_temporal_png = fig_dir / "temporal_analysis.png"
plt.savefig(_temporal_png, bbox_inches="tight")
print(f"📈 Figure saved: {_temporal_png}")
plt.show()

# 07.3: Export the report as JSON
_temporal_json = Path(ANALYSIS_DIR) / "temporal_report.json"
_temporal_payload = json.dumps(temp_report, cls=NumpyJSONEncoder, indent=2, ensure_ascii=False)
_temporal_json.write_text(_temporal_payload, encoding="utf-8")

# 07.4: Freshness breakdown per location (table + bar plot)
try:
    _required_cols = ("days_since_verification", LOCATION)
    if not all(col in df_temp.columns for col in _required_cols):
        print("ℹ️ Breakdown per location non disponibile (manca 'days_since_verification' o 'location').")
    else:
        ths = [30, 60, 90]
        grp = df_temp.groupby(LOCATION, observed=True)

        # Group size plus, per threshold, the mean of the matching
        # is_stale_<N>d flag scaled to a percentage (NaN if the flag is missing).
        fresk = pd.DataFrame({"count": grp.size()})
        for th in ths:
            flag_col = f"is_stale_{th}d"
            fresk[f"pct_over_{th}d"] = (
                (grp[flag_col].mean() * 100).astype(float)
                if flag_col in df_temp.columns
                else np.nan
            )

        fresk = fresk.sort_values("pct_over_30d", ascending=False)

        fresk_csv = ANALYSIS_DIR / "freshness_by_location.csv"
        fresk_parq = ANALYSIS_DIR / "freshness_by_location.parquet"
        fresk.to_csv(fresk_csv, encoding="utf-8")
        fresk.to_parquet(fresk_parq)
        print(f"💾 Saved: {fresk_csv}, {fresk_parq}")

        # Bar plot of % >30d (top 15 rows after sorting)
        top = fresk.head(15)
        ax = top["pct_over_30d"].plot(kind="bar")
        ax.set_title("% records >30d per location (top 15)")
        ax.set_ylabel("% >30d")
        ax.set_xlabel("Location")
        plt.xticks(rotation=30, ha="right")
        plt.tight_layout()
        _fresh_png = fig_dir / "freshness_over30_by_location.png"
        plt.savefig(_fresh_png, bbox_inches="tight")
        print(f"📈 Figure saved: {_fresh_png}")
        plt.show()
except Exception as e:
    # Best-effort section: a failure here is reported without stopping the notebook.
    print(f"⚠️ Impossibile calcolare il breakdown per location: {e}")

### Insights

In [None]:
# 08: Summary insights + export
ins = InsightsAnalyzer(top_n=5)
insights = ins.generate_value_insights(df)

ins_json = ANALYSIS_DIR / "insights.json"
_insights_payload = json.dumps(insights, cls=NumpyJSONEncoder, indent=2, ensure_ascii=False)
ins_json.write_text(_insights_payload, encoding="utf-8")
print(f"Saved: {ins_json}")

# Previews: each lookup falls back to an empty list when a key is missing.
top_val = insights.get("top_assets", {}).get("by_valuation", {}).get("data", [])
worst = insights.get("worst_assets", {}).get("by_condition", {}).get("data", [])
for _preview_rows in (top_val, worst):
    display(pd.DataFrame(_preview_rows).head(10))

### Outlier Analysis (IQR)

In [None]:
# 09.1–09.2: Outlier detection (IQR) → combine → stats → export (Parquet/JSON)
from notebooks.shared.n02_explore_dataset.eda_core import DEFAULT_NUMERIC_FEATURES, LEAKY_FEATURES
from pathlib import Path

# Default numeric features that are present in the dataset and not flagged as leaky.
numeric_cols = [
    c for c in DEFAULT_NUMERIC_FEATURES
    if c in df.columns and c not in LEAKY_FEATURES
]

od = OutlierDetector(method="iqr", iqr_multiplier=1.5)  # no output_dir here
out_summary = od.detect_outliers(df, columns=numeric_cols)

combined = od.combine_outlier_results(df, out_summary)
stats = od.get_outlier_summary_stats(df, out_summary, combined)

out_dir = Path("outputs/analysis/outliers")
out_dir.mkdir(parents=True, exist_ok=True)

# The JSON summary is always written; the combined table only when non-empty.
_outlier_summary_json = out_dir / "outliers_summary.json"
_outlier_summary_json.write_text(
    json.dumps(stats, indent=2, ensure_ascii=False, cls=NumpyJSONEncoder),
    encoding="utf-8",
)
if combined.empty:
    print("Nessun outlier combinato da salvare.")
else:
    combined.to_parquet(out_dir / "outliers_combined.parquet", index=False)
    combined.to_csv(out_dir / "outliers_combined.csv", index=False, encoding="utf-8")
    display(combined.head(10))

### ML Preparation Insights

In [None]:
# 10.1: Prepare features (no leakage) and compute importances
fia = FeatureImportanceAnalyzer(target_column=VALUATION_K, n_estimators=200, random_state=42)
X, y, feats = fia.prepare_features(df, include_proxies=False)
imps = fia.calculate_importances(X, y, calculate_permutation=True, n_repeats=5)

# "builtin" is required; "permutation" may be absent and then becomes an empty frame.
imp_builtin = imps["builtin"].reset_index(drop=True)
imp_perm = imps.get("permutation", pd.DataFrame()).reset_index(drop=True)
_has_permutation = not imp_perm.empty

display(imp_builtin.head(15))
if _has_permutation:
    display(imp_perm.head(15))

# Export the importances (CSV + Parquet)
imp_builtin.to_csv(ANALYSIS_DIR / "feature_importances_builtin.csv", index=False)
imp_builtin.to_parquet(ANALYSIS_DIR / "feature_importances_builtin.parquet", index=False)
if _has_permutation:
    imp_perm.to_csv(ANALYSIS_DIR / "feature_importances_permutation.csv", index=False)
    imp_perm.to_parquet(ANALYSIS_DIR / "feature_importances_permutation.parquet", index=False)

# 10.2: "Light" ablation over the first five rows of the builtin importances
top_feats = imp_builtin.head(5)["feature"].tolist()
abl = fia.perform_ablation_study(X, y, features_to_ablate=top_feats, cv_folds=3)
display(abl)

abl.to_csv(ANALYSIS_DIR / "ablation_study.csv")
abl.to_parquet(ANALYSIS_DIR / "ablation_study.parquet")

### EDAReportRunner (one-shot + manifest EDA)


In [None]:
# One-shot EDA run that writes its outputs under ANALYSIS_DIR.
runner = EDAReportRunner(output_dir=str(ANALYSIS_DIR))
eda_manifest = runner.run_full_eda(
    df,
    save_plots=True,
    save_tables=True,
)

print("EDA manifest written:", eda_manifest.get("manifest_path"))