In [23]:
# LOO étendue : classement d'influence des villes sur T_log pour fenêtres prioritaires
import os, re, glob, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Config
PRIO_SUM = 'results/temporal_cv_atypical_windows/diagnostics/tests_priority_summary.csv'
IDX_ENR = 'results/temporal_cv_atypical_windows/temporal_cv_atypical_periods_windows_index_enriched.csv'
OUT_DIR = 'results/temporal_cv_atypical_windows/diagnostics/loo_influence'
TOPN_WINDOWS = None   # None = traiter toutes les lignes de PRIO_SUM; sinon int
FEATURES_DEFAULT = ['temperature_celsius','humidity_percent','precipitation_mm','wind_speed_ms','urban_heat_island_intensity']
os.makedirs(OUT_DIR, exist_ok=True)

def safe_tag(s): return re.sub(r'[^0-9A-Za-z_.-]', '_', str(s))

def compute_d_estimate_from_X(X):
    if X.shape[0] < 2:
        return np.nan, np.nan, np.nan
    cov = np.cov(X, rowvar=False)
    eigvals = np.linalg.eigvalsh(cov)
    eigvals = np.maximum(eigvals, 0.0)
    sum_eig = np.sum(eigvals)
    d_part = 0.0 if sum_eig <= 0 else (sum_eig**2) / np.sum(eigvals**2)
    pca = PCA(n_components=min(X.shape[0], X.shape[1])).fit(X)
    cumvar = np.cumsum(pca.explained_variance_ratio_)
    d_pca90 = int(np.searchsorted(cumvar, 0.90) + 1) if cumvar[-1] >= 0.90 else pca.n_components_
    d_est = float((d_part + d_pca90) / 2.0)
    return d_part, d_pca90, d_est

def compute_T_log(d_est, n_eff):
    n_eff = max(2, int(n_eff))
    return (d_est - 4.0) * np.log(n_eff)

# Load priorities
if not os.path.exists(PRIO_SUM):
    raise FileNotFoundError(PRIO_SUM)
prio = pd.read_csv(PRIO_SUM)
if TOPN_WINDOWS:
    prio = prio.sort_values(prio.columns[0]).head(TOPN_WINDOWS)

influence_rows = []
for _, prow in prio.iterrows():
    tag = prow['tag']
    window_type = prow.get('window_type')
    window_center = prow.get('window_center')
    pipeline = prow.get('pipeline')
    # locate raw file (try index enriched then heuristics)
    raw_fp = None
    if os.path.exists(IDX_ENR):
        idx = pd.read_csv(IDX_ENR)
        # match best-effort (handle numeric string)
        cond = (idx['window_type']==window_type) & (idx['pipeline']==pipeline)
        try:
            wcf = float(window_center)
            cond = cond & (idx['window_center'].astype(float)==wcf)
        except Exception:
            cond = cond & (idx['window_center'].astype(str)==str(window_center))
        match = idx[cond]
        if not match.empty and 'raw_csv' in match.columns:
            raw_fp = match.iloc[0]['raw_csv']
    if raw_fp is None:
        candidates = glob.glob(os.path.join('results','temporal_cv_atypical_windows',f'*{tag}*raw.csv'))
        raw_fp = candidates[0] if candidates else None
    if not raw_fp or not os.path.exists(raw_fp):
        print(f"SKIP no raw for {tag}")
        continue

    df = pd.read_csv(raw_fp)
    if 'city_key' not in df.columns:
        agg_cols = [c for c in ['city','country','latitude','longitude'] if c in df.columns]
        df['city_key'] = df[agg_cols].astype(str).agg('_'.join, axis=1) if agg_cols else 'unknown'

    # select features
    num_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in ['year','month','n_rows']]
    features = [c for c in FEATURES_DEFAULT if c in num_cols] or (num_cols[:min(5,len(num_cols))] if num_cols else [])
    if not features:
        print(f"SKIP no numeric features for {tag}")
        continue

    # per-city aggregate (mean)
    city_df = df.groupby('city_key')[features].mean().reset_index()
    city_df_clean = city_df.dropna(subset=features).reset_index(drop=True)
    if city_df_clean.shape[0] < 2:
        print(f"SKIP not enough cities for {tag}")
        continue

    scaler = StandardScaler()
    X = scaler.fit_transform(city_df_clean[features].astype(float).values)
    d_part, d_pca90, d_est = compute_d_estimate_from_X(X)
    T_full = compute_T_log(d_est, X.shape[0])

    # LOO over cities
    loo_records = []
    for i in range(X.shape[0]):
        city_left = city_df_clean.loc[i,'city_key']
        X_loo = np.delete(X, i, axis=0)
        if X_loo.shape[0] < 2:
            continue
        d_part_loo, d_pca90_loo, d_est_loo = compute_d_estimate_from_X(X_loo)
        T_loo = compute_T_log(d_est_loo, X_loo.shape[0])
        delta = T_loo - T_full
        loo_records.append({'city': city_left, 'T_loo': float(T_loo), 'delta': float(delta),
                            'd_part_loo': float(d_part_loo), 'd_pca90_loo': int(d_pca90_loo),'d_est_loo': float(d_est_loo)})
    if not loo_records:
        continue
    loo_df = pd.DataFrame(loo_records).sort_values('delta')  # negative delta means removing city reduces T_log
    # rank by absolute influence
    loo_df['delta_abs'] = loo_df['delta'].abs()
    loo_df = loo_df.sort_values('delta_abs', ascending=False).reset_index(drop=True)

    # record top influencers
    top5 = loo_df.head(5)
    for rank, r in top5.iterrows():
        influence_rows.append({
            'tag': tag, 'window_type': window_type, 'window_center': window_center, 'pipeline': pipeline,
            'raw_csv': raw_fp, 'n_cities': X.shape[0], 'feature_list': ';'.join(features),
            'influencer_rank': rank+1, 'city': r['city'], 'delta_T_log': r['delta'], 'delta_abs': r['delta_abs'],
            'T_full': float(T_full), 'T_without_city': r['T_loo'],
            'd_est_full': float(d_est), 'd_est_without_city': r['d_est_loo']
        })

    # save LOO full table and plot per-window
    window_out = os.path.join(OUT_DIR, safe_tag(tag))
    os.makedirs(window_out, exist_ok=True)
    loo_df.to_csv(os.path.join(window_out, f'{safe_tag(tag)}_loo_by_city.csv'), index=False)

    # plot LOO distribution
    plt.figure(figsize=(6,3))
    plt.hist(loo_df['T_loo'], bins=max(8,min(20,loo_df.shape[0])), color='C0', alpha=0.8)
    plt.axvline(T_full, color='red', linestyle='--', label='T_full')
    plt.xlabel('T_log (leave-one-out)')
    plt.ylabel('count')
    plt.title(f'LOO T_log distribution - {tag}')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(window_out, f'{safe_tag(tag)}_loo_hist.png'), dpi=150)
    plt.close()

    # bar plot top deltas
    plt.figure(figsize=(6,3))
    top_plot = loo_df.sort_values('delta_abs', ascending=False).head(10)
    plt.barh(top_plot['city'].astype(str), top_plot['delta'], color='C1')
    plt.xlabel('delta T_log (T_without_city - T_full)')
    plt.title(f'Top city influence (abs delta) - {tag}')
    plt.tight_layout()
    plt.savefig(os.path.join(window_out, f'{safe_tag(tag)}_top_influence.png'), dpi=150)
    plt.close()

# write global ranking CSV
out_csv = os.path.join(OUT_DIR, 'loo_influence_top5_per_window.csv')
pd.DataFrame(influence_rows).to_csv(out_csv, index=False)
print("LOO influence results written to:", out_csv)
print("Per-window outputs under:", OUT_DIR)


LOO influence results written to: results/temporal_cv_atypical_windows/diagnostics/loo_influence\loo_influence_top5_per_window.csv
Per-window outputs under: results/temporal_cv_atypical_windows/diagnostics/loo_influence


### Résumé rapide des influences LOO

- Un petit groupe de villes réapparaît comme influenceur majeur : **Delhi, Beijing, Chicago, San Jose, Mexico City, Lagos, Dallas, New York, Phoenix, Mumbai**.  
- Pattern d’influence : Delhi et Beijing produisent les plus grands changements absolus de T_log en LOO ; Chicago et San Jose sont souvent 2–3 ; Mexico City et Lagos apparaissent comme influenceurs positifs sur plusieurs fenêtres.  
- Magnitudes observées : |delta_T_log| pour les top influencers varie ≈ 0.09–0.39, les plus fréquentes étant dans la plage 0.10–0.30.

---

### Interprétation synthétique

- Quelques villes concentrent la sensibilité du diagnostic global d / T_log, ce qui remet en question la robustesse du diagnostic si ces villes ne sont pas traitées explicitement.  
- Les features qui ressortent fréquemment comme sources d’outliers sont **precipitation_mm** et **urban_heat_island_intensity**.  
- L’influence est directionnelle : certaines villes tirent T_log vers le haut (delta positif), d’autres vers le bas (delta négatif), indiquant des distributions locales systématiquement différentes.

---

### Vérifications immédiates recommandées

1. Calculer pour les villes récurrentes (Delhi, Beijing, Chicago, San Jose, Mexico City, Lagos) : Mahalanobis robuste (MCD) et Z‑scores par feature.  
2. Tracer séries temporelles des features coupables (precipitation_mm, urban_heat_island_intensity, temperature_celsius, humidity_percent, wind_speed_ms) pour détecter si l’influence vient de quelques points extrêmes ou d’un décalage soutenu.  
3. Appliquer localement (par ville) des transformations alternatives et comparer : winsorisation (p.ex. 1–99e centile), log1p(precipitation_mm), clipping UHI ; mesurer l’impact sur T_log.  
4. Vérifier la dépendance à la pipeline : répéter LOO sous la pipeline cible (A, B ou C) pour confirmer que les influencers persistent.

---

### Stratégies d’atténuation classées

1. Pré‑traitement localisé par ville : winsoriser ou transformer la/les features responsables pour réduire l’effet de levier sans exclure la ville.  
2. Agrégation robuste / quorum : estimer T_log via estimateurs robustes (trimmed mean, median-of-estimates, bootstrap consensus) et exiger accord entre estimateurs avant décision opérationnelle.  
3. Politique flag‑and‑review : flag automatique des fenêtres où |delta_T_log| > seuil (p.ex. 0.10) avec paquet diagnostic (timeseries + per‑feature outlier summary) pour relecture humaine.  
4. Exclusion conditionnelle : n’exclure qu’après règles diagnostiques (Mahalanobis extrême + transform‑insensible) confirmant que l’influence est un artefact, pas une représentativité légitime.

---


Cellule (a) — tableau résumé des 5 villes (signal brut et distribution)

Résumé rapide de l'objectif de la cellule

Charge le CSV brut de la fenêtre atypique sélectionnée.

Filtre les observations des 5 villes proposées et affiche le signal brut (toutes observations conservées).

Calcule et affiche un tableau résumé (count, mean, std, median, min, max, skew) pour chaque feature et chaque ville.

Sauvegarde le tableau résumé et le long format des signaux bruts pour envoi / revue externe.

In [24]:
# Cellule (a) — afficher le signal brut et un tableau résumé pour 5 villes
# Modifier `RAW_CSV` et `TOP5_CITIES` si nécessaire.

import pandas as pd

# Chemin CSV de la fenêtre atypique que vous voulez inspecter
RAW_CSV = "results/temporal_cv_atypical_windows/temporal_cv_atypical_period_window_3yr_1984.00_A_raw.csv"

# Liste des 5 villes proposées (format attendu dans la colonne 'city' du CSV)
TOP5_CITIES = [
    "Delhi_India_28.7041_77.1025",
    "Chicago_USA_41.8781_-87.6298",
    "San Jose_USA_37.3382_-121.8863",
    "Mumbai_India_19.076_72.8777",
    "Beijing_China_39.9042_116.4074"
]

# Colonnes de signal (conserver la distribution brute)
FEATURE_COLS = [
    "temperature_celsius",
    "humidity_percent",
    "precipitation_mm",
    "wind_speed_ms",
    "urban_heat_island_intensity"
]

# Chargement et filtrage
df = pd.read_csv(RAW_CSV)
df_top5 = df[df["city"].isin(TOP5_CITIES)].copy()

# Afficher quelques lignes brutes (signal brut conservé)
print("Aperçu du signal brut pour les 5 villes (10 premières lignes) :")
display(df_top5.head(10))

# Tableau résumé par ville (statistiques descriptives sur la distribution brute)
summary_by_city = (
    df_top5
    .groupby("city")[FEATURE_COLS]
    .agg([
        ("count", "count"),
        ("mean", "mean"),
        ("std", "std"),
        ("median", "median"),
        ("min", "min"),
        ("max", "max"),
        ("skew", pd.Series.skew)
    ])
)

# Normaliser la mise en forme des colonnes (une ligne par statistique pour lisibilité)
summary_by_city.columns = ["_".join(col).strip() for col in summary_by_city.columns.values]
summary_by_city = summary_by_city.reset_index()

print("\nTableau résumé par ville (statistiques descriptives, distribution brute conservée) :")
display(summary_by_city)

# Optionnel : sauvegarder les sorties pour review externe
summary_by_city.to_csv("results/top5_cities_raw_distribution_summary.csv", index=False)
df_top5.to_csv("results/top5_cities_raw_signals_long.csv", index=False)

print("\nFichiers sauvegardés :")
print(" - results/top5_cities_raw_distribution_summary.csv")
print(" - results/top5_cities_raw_signals_long.csv")


Aperçu du signal brut pour les 5 villes (10 premières lignes) :


Unnamed: 0,city,country,latitude,longitude,year,month,temperature_celsius,humidity_percent,precipitation_mm,wind_speed_ms,urban_heat_island_intensity,data_source,ym,city_key



Tableau résumé par ville (statistiques descriptives, distribution brute conservée) :


Unnamed: 0,city,temperature_celsius_count,temperature_celsius_mean,temperature_celsius_std,temperature_celsius_median,temperature_celsius_min,temperature_celsius_max,temperature_celsius_skew,humidity_percent_count,humidity_percent_mean,...,wind_speed_ms_min,wind_speed_ms_max,wind_speed_ms_skew,urban_heat_island_intensity_count,urban_heat_island_intensity_mean,urban_heat_island_intensity_std,urban_heat_island_intensity_median,urban_heat_island_intensity_min,urban_heat_island_intensity_max,urban_heat_island_intensity_skew



Fichiers sauvegardés :
 - results/top5_cities_raw_distribution_summary.csv
 - results/top5_cities_raw_signals_long.csv


In [25]:
# Générer le tableau résumé top‑5 à partir du CSV LOO existant (chemin corrigé)
import os
import pandas as pd

# Chemin corrigé vers le CSV LOO trouvé dans le notebook
LOO_TOP5_CSV = "results/temporal_cv_atypical_windows/diagnostics/loo_influence/loo_influence_top5_per_window.csv"
OUT_SUMMARY = "results/top5_loo_summary.csv"

if not os.path.exists(LOO_TOP5_CSV):
    raise FileNotFoundError(f"Fichier LOO introuvable: {LOO_TOP5_CSV}")

loo = pd.read_csv(LOO_TOP5_CSV)

# Agréger par ville et calculer métriques d'influence
group = loo.groupby("city")
summary = group.agg(
    n_windows=("city", "count"),
    mean_delta_abs=("delta_abs", "mean"),
    median_delta_abs=("delta_abs", "median"),
    max_delta_abs=("delta_abs", "max"),
    mean_delta_signed=("delta_T_log", "mean")
).reset_index()

# Proportion de fenêtres couvertes (en %)
n_windows_total = loo["window_center"].astype(str).nunique()
summary["prop_windows_pct"] = (summary["n_windows"] / max(1, n_windows_total)) * 100

# Trier et prendre top 5 par impact moyen absolu
summary = summary.sort_values("mean_delta_abs", ascending=False).reset_index(drop=True)
top5_summary = summary.head(5).copy()

# Colonnes finales lisibles
top5_summary = top5_summary[[
    "city",
    "n_windows",
    "prop_windows_pct",
    "mean_delta_abs",
    "median_delta_abs",
    "max_delta_abs",
    "mean_delta_signed"
]]

os.makedirs(os.path.dirname(OUT_SUMMARY), exist_ok=True)
top5_summary.to_csv(OUT_SUMMARY, index=False)

print("Top 5 villes (résumé LOO) — sauvegardé sous :", OUT_SUMMARY)
display(top5_summary)


Top 5 villes (résumé LOO) — sauvegardé sous : results/top5_loo_summary.csv


Unnamed: 0,city,n_windows,prop_windows_pct,mean_delta_abs,median_delta_abs,max_delta_abs,mean_delta_signed
0,New York_USA_40.7128_-74.006,6,100.0,0.354811,0.354811,0.367811,-0.354811
1,Tokyo_Japan_35.6762_139.6503,3,50.0,0.31051,0.31051,0.31051,-0.31051
2,Delhi_India_28.7041_77.1025,14,233.333333,0.281196,0.303564,0.390403,-0.281196
3,Dallas_USA_32.7767_-96.797,7,116.666667,0.238337,0.246162,0.27645,-0.238337
4,Chicago_USA_41.8781_-87.6298,11,183.333333,0.227382,0.205147,0.41294,-0.227382


### Résumé concis des résultats top‑5 LOO (fichier top5_loo_summary.csv)

- **Classement** : New York, Tokyo, Delhi, Dallas, Chicago sont les 5 villes avec le plus fort impact moyen absolu sur T_log.  
- **Amplitude** : New York affiche le plus fort impact moyen |delta| ≈ **0.355** (max 0.368), Tokyo ≈ **0.311**, Delhi ≈ **0.281**, Dallas ≈ **0.238**, Chicago ≈ **0.227**.  
- **Fréquence d’apparition** : Delhi et Chicago apparaissent dans beaucoup de fenêtres (Delhi 14 apparitions, Chicago 11), New York apparaît sur toutes les fenêtres considérées ici (n_windows = 6, prop = 100%).  
- **Direction moyenne** : toutes ces villes ont une mean_delta_signed négative, donc leur suppression fait baisser T_log (elles tirent T_log vers le haut quand elles sont incluses).  
- **Interprétation rapide** : ces villes exercent un levier substantiel sur la classification de régime; leur signal semble systématique (répété sur plusieurs fenêtres) plutôt que ponctuel pour au moins Delhi et Chicago.


Cell prête à exécution — diagnostics bruts pour le top‑5 (sauvegarde dans results/diagnostics_top5/)



In [26]:
# Générer diagnostics bruts pour les 5 villes du top5_loo_summary.csv
# Sortie: results/diagnostics_top5/{city_key}/ :
#   - summary_{city_key}.csv (mini résumé LOO + stats feature)
#   - timeseries_{city_key}.png (séries temporelles des features)
#   - signals_{city_key}.csv (observations brutes conservées)
#   - robust_mahalanobis_{city_key}.csv (Mahalanobis robuste par observation)
#   - zscores_{city_key}.csv (Z-scores par feature)
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.covariance import MinCovDet
from datetime import datetime

# Config chemins
TOP5_CSV = "results/top5_loo_summary.csv"
DATA_CSV = "data/urban_climate.csv"
OUT_DIR = Path("results/diagnostics_top5")
FEATURE_COLS = [
    "temperature_celsius",
    "humidity_percent",
    "precipitation_mm",
    "wind_speed_ms",
    "urban_heat_island_intensity"
]

# Vérifications rapides
for p in (TOP5_CSV, DATA_CSV):
    if not Path(p).exists():
        raise FileNotFoundError(f"Fichier introuvable: {p}")

os.makedirs(OUT_DIR, exist_ok=True)

# Charger top5 et données brutes
top5 = pd.read_csv(TOP5_CSV)
df = pd.read_csv(DATA_CSV)

# Normaliser colonne city_key si besoin
if "city_key" not in df.columns:
    df["city_key"] = df[["city", "country", "latitude", "longitude"]].astype(str).agg("_".join, axis=1)

# Extraire liste de villes (city strings) depuis top5
cities = top5["city"].astype(str).tolist()

# Fonctions utilitaires
def robust_mahalanobis(X):
    # X: numpy array (n x p) without NaNs
    mcd = MinCovDet().fit(X)
    md = mcd.mahalanobis(X)  # squared Mahalanobis distances
    return md, mcd.location_, mcd.covariance_

def safe_zscores(X):
    mu = np.nanmedian(X, axis=0)
    mad = np.nanmedian(np.abs(X - mu), axis=0)
    # convert MAD to approx std: 1.4826*mad (robust)
    robust_std = 1.4826 * mad
    with np.errstate(invalid="ignore", divide="ignore"):
        z = (X - mu) / robust_std
    return z, mu, robust_std

# Pour chaque ville : filtrer, calculer, sauvegarder et tracer
for city in cities:
    city_dir = OUT_DIR / city
    os.makedirs(city_dir, exist_ok=True)

    # Filtrer observations brutes pour la ville (toutes obs conservées)
    city_df = df[df["city_key"].astype(str) == city].copy()
    # If no rows, still create placeholders
    if city_df.shape[0] == 0:
        # create empty placeholder files so reviewer sees missing data
        pd.DataFrame().to_csv(city_dir / f"signals_{city}.csv", index=False)
        pd.DataFrame().to_csv(city_dir / f"summary_{city}.csv", index=False)
        continue

    # Keep original columns of interest and temporal ordering
    # create ym if absent
    if "ym" not in city_df.columns:
        city_df["ym"] = city_df.apply(lambda r: f"{int(r['year']):04d}-{int(r['month']):02d}" if not np.isnan(r['year']) and not np.isnan(r['month']) else "", axis=1)

    signals_out = city_dir / f"signals_{city}.csv"
    city_df.to_csv(signals_out, index=False)

    # Per-feature descriptive stats (brute distribution)
    stats = city_df[FEATURE_COLS].agg(["count", "mean", "std", "median", "min", "max", "skew"]).transpose()
    stats = stats.reset_index().rename(columns={"index":"feature"})
    # Add simple LOO summary if available in top5_loo_summary or loo_influence file
    loo_row = top5[top5["city"]==city]
    if not loo_row.empty:
        loo_meta = loo_row.iloc[0].to_dict()
    else:
        loo_meta = {}

    # Robust Mahalanobis: drop rows with NaN in FEATURE_COLS for robust fit
    X = city_df[FEATURE_COLS].to_numpy(dtype=float)
    valid_mask = ~np.isnan(X).any(axis=1)
    md = np.array([])
    mcd_loc = None
    mcd_cov = None
    if valid_mask.sum() >= (len(FEATURE_COLS) + 1):
        try:
            md, mcd_loc, mcd_cov = robust_mahalanobis(X[valid_mask])
        except Exception:
            md = np.array([])
    # Prepare mahalanobis DF aligned to original rows (NaN where invalid)
    md_full = np.full(shape=(city_df.shape[0],), fill_value=np.nan)
    if md.size:
        md_full[valid_mask] = md
    mahal_df = pd.DataFrame({
        "ym": city_df.get("ym", pd.Series([""]*len(city_df))),
        "mahalanobis_sq": md_full
    })
    mahal_df.to_csv(city_dir / f"robust_mahalanobis_{city}.csv", index=False)

    # Robust Z-scores (median/MAD)
    z, med, rstd = safe_zscores(X)
    z_full = np.full_like(X, np.nan)
    z_full[~np.isnan(X).any(axis=1)] = z[~np.isnan(X).any(axis=1)]
    z_df = pd.DataFrame(z_full, columns=FEATURE_COLS)
    z_df.insert(0, "ym", city_df.get("ym", pd.Series([""]*len(city_df))))
    z_df.to_csv(city_dir / f"zscores_{city}.csv", index=False)

    # Compose summary CSV (combined)
    summary_rows = []
    # per-feature summary lines
    for _, row in stats.iterrows():
        summary_rows.append({
            "city": city,
            "feature": row["feature"],
            "count": row["count"],
            "mean": row["mean"],
            "std": row["std"],
            "median": row["median"],
            "min": row["min"],
            "max": row["max"],
            "skew": row["skew"]
        })
    summary_df = pd.DataFrame(summary_rows)
    # Add a top-level metadata row (one-line) appended at top
    meta = {
        "city": city,
        "feature": "meta",
        "count": city_df.shape[0],
        "mean": loo_meta.get("mean_delta_abs", np.nan),
        "std": loo_meta.get("median_delta_abs", np.nan),
        "median": loo_meta.get("max_delta_abs", np.nan),
        "min": loo_meta.get("n_windows", np.nan),
        "max": loo_meta.get("prop_windows_pct", np.nan),
        "skew": loo_meta.get("mean_delta_signed", np.nan)
    }
    summary_out = city_dir / f"summary_{city}.csv"
    pd.concat([pd.DataFrame([meta]), summary_df], ignore_index=True).to_csv(summary_out, index=False)

    # Timeseries plot (raw values per feature)
    fig, ax = plt.subplots(len(FEATURE_COLS), 1, figsize=(10, 2.2 * len(FEATURE_COLS)), sharex=True)
    if len(FEATURE_COLS) == 1:
        ax = [ax]
    # temporal order by year-month if available, else index
    if "year" in city_df.columns and "month" in city_df.columns and not city_df["year"].isna().all():
        city_df["__time"] = pd.to_datetime(city_df["year"].astype(int).astype(str) + "-" + city_df["month"].astype(int).astype(str) + "-01", errors="coerce")
        city_df = city_df.sort_values("__time")
        x = city_df["__time"]
    else:
        city_df = city_df.reset_index(drop=True)
        x = city_df.index
    for i, f in enumerate(FEATURE_COLS):
        ax[i].plot(x, city_df[f], marker=".", linestyle="-", alpha=0.8)
        ax[i].set_ylabel(f)
        ax[i].grid(True, alpha=0.3)
    title = f"{city} — signal brut ({city_df.shape[0]} obs)"
    fig.suptitle(title)
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    plot_path = city_dir / f"timeseries_{city}.png"
    fig.savefig(plot_path, dpi=150)
    plt.close(fig)

    # Save a short meta README
    with open(city_dir / "README.txt", "w", encoding="utf-8") as f:
        f.write(f"Diagnostics generated: {datetime.utcnow().isoformat()}Z\n")
        f.write(f"City key: {city}\n")
        f.write(f"Observations saved: {signals_out.name}\n")
        f.write(f"Summary saved: {summary_out.name}\n")
        f.write(f"Mahalanobis saved: robust_mahalanobis_{city}.csv\n")
        f.write(f"Zscores saved: zscores_{city}.csv\n")
        if mcd_loc is not None:
            f.write("Mahalanobis model: robust location and covariance available\n")
        else:
            f.write("Mahalanobis model: not fitted (insufficient valid rows) or error\n")

print("Diagnostics générés pour top5 — dossier :", OUT_DIR)


  f.write(f"Diagnostics generated: {datetime.utcnow().isoformat()}Z\n")
  f.write(f"Diagnostics generated: {datetime.utcnow().isoformat()}Z\n")
  f.write(f"Diagnostics generated: {datetime.utcnow().isoformat()}Z\n")
  f.write(f"Diagnostics generated: {datetime.utcnow().isoformat()}Z\n")


Diagnostics générés pour top5 — dossier : results\diagnostics_top5


  f.write(f"Diagnostics generated: {datetime.utcnow().isoformat()}Z\n")
