In [None]:
"""
Notebook section: 1.3.11 Preliminary dataset preparation
Team 102D · AB Data Challenge — Iteration 2

Inputs
  • results/iteration_2/features_v2.csv (or .parquet)
  • results/iteration_2/selected_features_v1.txt (from 1.3.10)

This script:
  1) Loads features and the first‑pass selected feature list
  2) Chooses identifier + label columns (y_anom)
  3) Train/valid/test split (70/15/15) with Group-aware splitting by meter when available
  4) Preprocessing artifacts (median imputer + standard scaler stats)
  5) Class balance summary + suggested class weights
  6) Saves ready-to-model tables and metadata

Outputs (results/iteration_2/prep_1_3_11/)
  • split_assignments.csv  — row-wise split label (train/valid/test)
  • Xy_train.parquet / Xy_valid.parquet / Xy_test.parquet  — selected features + y
  • preprocess_imputer_medians.json, preprocess_scaler_stats.json
  • class_balance.json, prep_summary.json, selected_features_resolved.txt
  • columns_manifest.json (feature columns, id, label)

Notes
  • Split priority: group-aware by id column when one exists; otherwise stratified on y when it has at least two classes; otherwise a seeded random split.
  • Keeps id/time/label columns alongside X to ease later joins.
"""

# === Imports & setup ===
import os, json, warnings
import numpy as np
import pandas as pd
from typing import List

from sklearn.model_selection import GroupShuffleSplit, StratifiedShuffleSplit
from sklearn.impute import SimpleImputer

# NOTE(review): no category or module filter is given, so this silences every
# warning raised after this point, including deprecation and runtime warnings
# coming from pandas / scikit-learn.
warnings.filterwarnings("ignore")

# === Paths ===
# Everything is resolved relative to the current working directory.
PROJECT_ROOT = "."
RESULTS_DIR = os.path.join(PROJECT_ROOT, "results", "iteration_2")
PREP_DIR = os.path.join(RESULTS_DIR, "prep_1_3_11")

# Output folder of this section; created up front so later writes find it.
os.makedirs(PREP_DIR, exist_ok=True)

# Input files expected under RESULTS_DIR: the feature table (CSV or parquet)
# and the feature list written by section 1.3.10.
FEATS_CSV, FEATS_PQ, SEL_TXT = (
    os.path.join(RESULTS_DIR, fname)
    for fname in ("features_v2.csv", "features_v2.parquet", "selected_features_v1.txt")
)

# === Load features ===
# The CSV export takes precedence; the parquet file is only read when no CSV
# exists. With neither file present the run stops immediately.
if not (os.path.exists(FEATS_CSV) or os.path.exists(FEATS_PQ)):
    raise SystemExit("features_v2 not found under results/iteration_2.")

df = (
    pd.read_csv(FEATS_CSV, low_memory=False)
    if os.path.exists(FEATS_CSV)
    else pd.read_parquet(FEATS_PQ)
)

print("Loaded features:", df.shape)

# === Identify columns ===
# Identifier candidates in priority order; the first one present in df becomes
# id_col (None when neither exists).
ID_CANDS = ["num_serie_contador", "polissa_id"]
# Time/context columns that are carried along with the features when present.
TIME_COLS = ["datetime","date","data_inici","data_fi","year","month","dayofweek","hour"]
# Label column name, or None when the feature table has no y_anom column.
LABEL = "y_anom" if "y_anom" in df.columns else None

id_col = next((c for c in ID_CANDS if c in df.columns), None)

# Selected features from 1.3.10
if os.path.exists(SEL_TXT):
    with open(SEL_TXT, "r", encoding="utf-8") as f:
        listed = [ln.strip() for ln in f]
    # Keep only names that exist in df. Blank lines are skipped, repeated names
    # are collapsed (first occurrence wins) so the feature matrix never gets
    # duplicate columns, and the label is never allowed in as a feature.
    selected = [
        c for c in dict.fromkeys(listed)
        if c and c in df.columns and c != LABEL
    ]
else:
    # Fallback: take numeric columns not including ids/time/label
    exclude = set(ID_CANDS + TIME_COLS + ([LABEL] if LABEL else []) + ["codi_anomalia"])
    selected = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in exclude]

# Stop here with a readable message instead of failing later inside the
# imputer when there is nothing to model.
if not selected:
    raise SystemExit("No usable feature columns found for section 1.3.11.")

# Persist the resolved list (same encoding as the input list is read with).
with open(os.path.join(PREP_DIR, "selected_features_resolved.txt"), "w", encoding="utf-8") as f:
    f.writelines(f"{c}\n" for c in selected)

# === Build the modeling table (X + y + id/time for context) ===
# Column order: features, then label, then id and time columns. Names are
# de-duplicated keeping the first occurrence and limited to columns df has.
wanted = selected + ([LABEL] if LABEL else []) + ID_CANDS + TIME_COLS
keep_cols = [col for col in dict.fromkeys(wanted) if col in df.columns]
mod = df[keep_cols].copy()

# Ensure numeric dtype on features; values that cannot be parsed become NaN.
for feat in selected:
    mod[feat] = pd.to_numeric(mod[feat], errors="coerce")

# === Split: 70/15/15 ===
# Strategy, in priority order:
#   1) an id column exists -> group-aware split (all rows of one id stay in one
#      split; the 70/15/15 proportions then apply to groups, not rows)
#   2) a usable label      -> stratified split on the label (missing labels are
#      treated as class 0 for stratification only)
#   3) otherwise           -> seeded random split
#
# Every splitter returns POSITIONS into `mod`. They are kept as positions while
# splitting and converted to index labels once at the end, because the later
# `.loc` lookups are label-based and `mod.index` is not guaranteed to be
# 0..N-1 (e.g. features loaded from a parquet file that carries its own index).
N = len(mod)
split_assign = pd.Series(index=mod.index, dtype="string")

# A label is usable only if the column exists, has at least one value and
# contains at least two distinct classes.
has_label = (LABEL is not None) and mod[LABEL].notna().any() and mod[LABEL].nunique() >= 2

if id_col is not None:
    # Group-aware: split on groups
    gss1 = GroupShuffleSplit(n_splits=1, train_size=0.70, random_state=42)
    tr_pos, hold_idx = next(gss1.split(mod, groups=mod[id_col]))
    # valid/test from holdout; the second splitter returns positions relative
    # to `hold`, which are mapped back to positions in `mod` through hold_idx.
    hold = mod.iloc[hold_idx]
    gss2 = GroupShuffleSplit(n_splits=1, train_size=0.50, random_state=43)
    va_rel, te_rel = next(gss2.split(hold, groups=hold[id_col]))
    va_pos, te_pos = hold_idx[va_rel], hold_idx[te_rel]
elif has_label:
    y_strat = mod[LABEL].fillna(0)
    sss1 = StratifiedShuffleSplit(n_splits=1, train_size=0.70, random_state=42)
    tr_pos, hold_idx = next(sss1.split(mod, y_strat))
    sss2 = StratifiedShuffleSplit(n_splits=1, train_size=0.50, random_state=43)
    va_rel, te_rel = next(sss2.split(mod.iloc[hold_idx], y_strat.iloc[hold_idx]))
    va_pos, te_pos = hold_idx[va_rel], hold_idx[te_rel]
else:
    rng = np.random.default_rng(42)
    order = np.arange(N)
    rng.shuffle(order)
    tr_cut = int(0.70*N)
    va_cut = tr_cut + int(0.15*N)
    tr_pos, va_pos, te_pos = order[:tr_cut], order[tr_cut:va_cut], order[va_cut:]

# Positional assignment: correct for any index, not only a default RangeIndex.
split_assign.iloc[tr_pos] = "train"
split_assign.iloc[va_pos] = "valid"
split_assign.iloc[te_pos] = "test"

# Label-based views of the three splits, for `.loc` lookups further down.
tr_idx = mod.index[tr_pos]
va_idx = mod.index[va_pos]
te_idx = mod.index[te_pos]

# === Preprocessing: imputer medians + (optional) scaler stats ===
# Every statistic is estimated on TRAIN rows only, so nothing from the
# valid/test rows leaks into the preprocessing artifacts.
train_X = mod.loc[split_assign == "train", selected]

# Per-feature medians used for imputation. A feature with no observed value in
# the train rows gets a NaN median.
imp = SimpleImputer(strategy="median")
medians = pd.Series(imp.fit(train_X).statistics_, index=selected)

# Scaler stats (mean/std). A zero std is replaced by NaN, presumably so that a
# later division by std cannot divide by zero — confirm with the consumer.
train_means = train_X.mean(numeric_only=True)
train_stds  = train_X.std(numeric_only=True).replace(0, np.nan)

# Persist preprocess artifacts
medians.to_json(os.path.join(PREP_DIR, "preprocess_imputer_medians.json"))
(pd.DataFrame({"mean": train_means, "std": train_stds})
   .to_json(os.path.join(PREP_DIR, "preprocess_scaler_stats.json"), orient="table"))

# === Class balance summary ===
# Counts come from the TRAIN split only. Weights use the two-class form of
# sklearn's 'balanced' formula: (n0 + n1) / (2 * n_class).
if not has_label:
    class_balance = {"note": "Label unavailable or single-class; skipping class balance."}
else:
    train_labels = mod.loc[split_assign == "train", LABEL]
    vc = train_labels.value_counts(dropna=False).to_dict()
    n0, n1 = int(vc.get(0, 0)), int(vc.get(1, 0))
    # Fall back to 1 so the weights stay finite when neither class was counted.
    total = (n0 + n1) or 1
    # max(count, 1) keeps the weight finite when a class is absent from train.
    w0, w1 = (total / (2 * max(count, 1)) for count in (n0, n1))
    class_balance = {
        "train_counts": {"0": n0, "1": n1},
        "suggested_weights": {"0": w0, "1": w1},
    }

balance_path = os.path.join(PREP_DIR, "class_balance.json")
with open(balance_path, "w") as f:
    json.dump(class_balance, f, indent=2)

# === Write split assignments & datasets ===
# One line per row of the modeling table: its index value and its split name.
assign_df = pd.DataFrame({"row": mod.index, "split": split_assign.values})
assign_path = os.path.join(PREP_DIR, "split_assignments.csv")
assign_df.to_csv(assign_path, index=False)

# Save each split with id/time + y + features (in that column order),
# de-duplicated and restricted to columns the modeling table actually has.
ordered_cols = ID_CANDS + TIME_COLS + ([LABEL] if LABEL else []) + selected
cols_out = [col for col in dict.fromkeys(ordered_cols) if col in mod.columns]

split_members = {"train": tr_idx, "valid": va_idx, "test": te_idx}
for split_name, members in split_members.items():
    out_path = os.path.join(PREP_DIR, f"Xy_{split_name}.parquet")
    mod.loc[members, cols_out].to_parquet(out_path, index=False)

# === Columns manifest & prep summary ===
# Manifest: which columns are features, label, id and time context.
present_time_cols = [col for col in TIME_COLS if col in mod.columns]
manifest = {
    "features": selected,
    "label": LABEL,
    "id_col": id_col,
    "time_cols": present_time_cols,
}
manifest_path = os.path.join(PREP_DIR, "columns_manifest.json")
with open(manifest_path, "w") as f:
    json.dump(manifest, f, indent=2)

# Summary: table size, label availability, rows per split, output file names.
split_sizes = assign_df["split"].value_counts(dropna=False).to_dict()
summary = {
    "rows": int(len(mod)),
    "n_features": int(len(selected)),
    "has_label": bool(has_label),
    "splits": split_sizes,
}
summary["paths"] = {
    "train": "Xy_train.parquet",
    "valid": "Xy_valid.parquet",
    "test":  "Xy_test.parquet",
}
summary_path = os.path.join(PREP_DIR, "prep_summary.json")
with open(summary_path, "w") as f:
    json.dump(summary, f, indent=2)

print("Saved to:", PREP_DIR)
print("Done.")
