In [1]:
import pandas as pd
import re
from pathlib import Path
import numpy as np

In [2]:
def canonical_family(name: str) -> str:
    """
    Map a full ModelName to its canonical model family.

    Variants collapse onto a single label, e.g. LightGBMLarge/LightGBMXT ->
    'LightGBM', RandomForestGini/Entr -> 'RandomForest', ExtraTrees* ->
    'ExtraTrees'. A name that matches no known family falls back to its
    leading run of ASCII letters, or to the suffix-stripped name itself when
    it does not start with a letter.
    """
    # Drop a trailing run/version tag such as "_r12", "-v3" or "v2".
    stripped = re.sub(r'([_-])?(r|v)\d+$', '', str(name), flags=re.IGNORECASE)
    # Matching key: lowercase, with everything outside [a-z0-9] removed.
    key = re.sub(r'[^a-z0-9]', '', stripped.lower())

    # (family, substrings, prefixes, exact matches). Order matters: the first
    # rule that hits decides the family.
    rules = (
        ('CatBoost', ('catboost',), (), ()),
        ('XGBoost', ('xgboost',), ('xgb',), ()),
        ('LightGBM', ('lightgbm', 'lgbm'), (), ()),
        ('RandomForest', ('randomforest',), ('rf',), ()),
        ('ExtraTrees', ('extratrees', 'extratree'), ('et',), ()),
        ('KNeighbors', ('kneighbors', 'knearest'), ('knn',), ()),
        ('LogisticRegression', ('logistic',), (), ('lr',)),
        ('SVM', ('svm', 'svc'), (), ()),
        ('DecisionTree', ('decisiontree',), (), ('dt',)),
    )
    for family, substrings, prefixes, exact in rules:
        if (
            any(token in key for token in substrings)
            or key.startswith(prefixes)
            or key in exact
        ):
            return family

    # Unknown family: keep the leading alphabetic part of the stripped name.
    leading = re.match(r'[a-zA-Z]+', stripped)
    return leading.group(0) if leading else stripped

# Carga de datos

## Cargar los CSV agregados ubicados fuera de las carpetas de ejecución

In [3]:
# Base directory where the extracted data lives.
BASE = Path("./dataset")

def find_variant_dirs(base: Path, variant_names=("ML_nV8", "ML_nV19")):
    """
    Locate variant directories anywhere below ``base``.

    Parameters
    ----------
    base : Path
        Root directory that is searched recursively.
    variant_names : iterable of str, optional
        Directory names to look for. Defaults to the two variants used in
        this notebook.

    Returns
    -------
    dict
        Maps each directory name that was found to its path. Names with no
        matching directory are simply absent.
    """
    wanted = set(variant_names)
    found = {}
    # rglob yields paths in no particular order. Sorting makes the outcome
    # deterministic when the same name occurs more than once: the last path
    # in sorted order wins.
    for path in sorted(base.rglob("*")):
        if path.name in wanted and path.is_dir():
            found[path.name] = path
    return found

# Variant folder name -> path, for every variant directory found under BASE.
variant_dirs = find_variant_dirs(BASE)

# Configuration tag embedded in file names:
# ID_2C_NvsSD_nR<int>_nV<int>_nF<int>, optionally followed by _Seed<int> or
# _Seed_<int>. Matching is case-insensitive.
re_cfg = re.compile(
    r"ID_2C_NvsSD_nR(?P<nR>\d+)_nV(?P<nV>\d+)_nF(?P<nF>\d+)(?:_Seed_?(?P<seed>\d+))?",
    flags=re.I,
)


def parse_meta_from_name(name: str):
    """
    Extract the nR / nV / nF / seed integers encoded in ``name``.

    Returns a dict with exactly those four keys. All values are None when the
    configuration tag is not found anywhere in ``name``; ``seed`` alone is
    None when the tag carries no seed part.
    """
    found = re_cfg.search(name)
    if found is None:
        return {"nR": None, "nV": None, "nF": None, "seed": None}
    # Groups that did not participate in the match stay None.
    return {
        key: (None if raw is None else int(raw))
        for key, raw in found.groupdict().items()
    }

def load_aggregated(variant_path: Path):
    """
    Load the aggregated CSV outputs stored directly inside ``variant_path``.

    Three groups of files are read (non-recursively): ``Metrics_CV_*.csv``,
    ``TestPredCV_*.csv`` and ``FeatureImportance_ID_*.csv``. Each file is
    tagged with the nR / nV / nF / seed values parsed from its file name, and
    the files of a group are stacked into a single DataFrame.

    Parameters
    ----------
    variant_path : Path
        Directory that contains the CSV files.

    Returns
    -------
    tuple of (metrics, preds, fi)
        One DataFrame per group; a group with no matching files yields an
        empty DataFrame.
    """
    def load_files(pattern):
        """Read and stack every CSV in ``variant_path`` that matches ``pattern``."""
        dfs = []
        # glob returns paths in no particular order; sort so the row order of
        # the stacked frame is reproducible across runs and machines.
        for fp in sorted(variant_path.glob(pattern)):
            df = pd.read_csv(fp)
            meta = parse_meta_from_name(fp.name)
            for k, v in meta.items():
                # Never overwrite a column the CSV already provides.
                if k not in df.columns:
                    df[k] = v
            dfs.append(df)
        return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

    metrics = load_files("Metrics_CV_*.csv")
    preds   = load_files("TestPredCV_*.csv")
    fi      = load_files("FeatureImportance_ID_*.csv")
    return metrics, preds, fi

# Variant name -> (metrics, preds, fi) frames loaded from that variant's folder.
variant_data = {
    variant_name: load_aggregated(variant_path)
    for variant_name, variant_path in variant_dirs.items()
}


In [4]:
# Unpack each variant's (metrics, preds, fi) triple; a variant that was not
# found falls back to three fresh empty frames.
metrics_v8, preds_v8, fi_v8 = (
    variant_data["ML_nV8"]
    if "ML_nV8" in variant_data
    else (pd.DataFrame(), pd.DataFrame(), pd.DataFrame())
)
metrics_v19, preds_v19, fi_v19 = (
    variant_data["ML_nV19"]
    if "ML_nV19" in variant_data
    else (pd.DataFrame(), pd.DataFrame(), pd.DataFrame())
)

In [5]:
def collapse_preds(preds_df):
    """
    Reshape wide per-model prediction columns into one long table.

    Every column named ``testPredProba_S<seed>_<model>`` contributes one row
    per input row, with the columns ``etiq_id``, ``ED_2Clases``, ``proba``,
    ``fold``, ``seed`` and ``model``, followed by whichever of ``nR`` /
    ``nV`` / ``nF`` exist in the input. ``fold`` is taken from the companion
    ``testNumFold_S<seed>_<model>`` column and is NaN when that column is
    missing.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Wide prediction table. When it has probability columns it must also
        have the ``etiq-id`` and ``ED_2Clases`` columns.

    Returns
    -------
    pandas.DataFrame
        The long table. If the input has no probability columns (for example
        an empty DataFrame), an empty frame with the expected columns is
        returned instead of raising.
    """
    id_col = 'etiq-id'
    label_col = 'ED_2Clases'
    meta_cols = [c for c in ['nR', 'nV', 'nF'] if c in preds_df.columns]
    out_cols = ['etiq_id', 'ED_2Clases', 'proba', 'fold', 'seed', 'model'] + meta_cols

    proba_cols = [
        c for c in preds_df.columns
        if isinstance(c, str) and c.startswith('testPredProba')
    ]
    long_parts = []

    for proba_col in proba_cols:
        _, seed_str, model_name = proba_col.split('_', 2)
        seed = int(seed_str[1:])  # drop the leading 'S'

        # Look for the fold column with the seed token exactly as written
        # first (so zero-padded seeds such as "S02" still match), then with
        # the normalised "S<int>" spelling.
        fold_candidates = (
            f"testNumFold_{seed_str}_{model_name}",
            f"testNumFold_S{seed}_{model_name}",
        )
        fold_col = next((c for c in fold_candidates if c in preds_df.columns), None)
        fold_data = preds_df[fold_col] if fold_col is not None else np.nan

        # NOTE(review): rows whose probability is NaN are kept as-is; confirm
        # whether they should be dropped when the input stacks several files.
        part = preds_df[[id_col, label_col, proba_col]].copy()
        part.columns = ['etiq_id', 'ED_2Clases', 'proba']
        part['fold'] = fold_data
        part['seed'] = seed
        part['model'] = model_name

        for mc in meta_cols:
            part[mc] = preds_df[mc]

        long_parts.append(part)

    # pd.concat raises on an empty list, so return an empty, correctly
    # shaped frame when there was nothing to collapse.
    if not long_parts:
        return pd.DataFrame(columns=out_cols)

    preds_long = pd.concat(long_parts, ignore_index=True)
    return preds_long

In [6]:

# Long-format predictions for each variant, computed in v8-then-v19 order.
preds_v8_long, preds_v19_long = (
    collapse_preds(wide_preds) for wide_preds in (preds_v8, preds_v19)
)

In [7]:
# Metrics: stack both variants, tag each row with its model family, persist.
metrics = (
    pd.concat([metrics_v8, metrics_v19], ignore_index=True)
    .assign(ModelFamily=lambda frame: frame['ModelName'].apply(canonical_family))
)
metrics.to_parquet("metrics.parquet", index=False)

# Predictions: stack the long-format frames of both variants and persist.
label_col = 'ED_2Clases'
preds = pd.concat([preds_v8_long, preds_v19_long], ignore_index=True)
preds.to_parquet("preds.parquet", index=False)

# Feature importances: same treatment as metrics, keyed on the 'model' column.
fi = (
    pd.concat([fi_v8, fi_v19], ignore_index=True)
    .assign(ModelFamily=lambda frame: frame['model'].apply(canonical_family))
)
fi.to_parquet("fi.parquet", index=False)

## Cargar los leaderboards ubicados dentro de las carpetas de ejecución

In [8]:
# Root folder that is searched recursively for per-run output directories.
BASE = Path("./dataset")

# Run-directory name: ID_2C_NvsSD_nR<int>_nV<int>_nF<int>_Seed[_]<int>, then an
# optional _Fold[_]<int> and an optional alphanumeric _runID_<id>, anchored at
# the end of the name. Matching is case-insensitive.
pat = re.compile(
    r"ID_2C_NvsSD_"
    r"nR(?P<nR>\d+)_nV(?P<nV>\d+)_nF(?P<nF>\d+)"
    r"_Seed_?(?P<seed>\d+)"
    r"(?:_Fold_?(?P<fold>\d+))?"
    r"(?:_runID_[A-Za-z0-9]+)?$",
    re.I
)

frames = []
# rglob/glob yield matches in no particular order; sorting keeps the row order
# of the combined leaderboard reproducible across runs and machines.
for run_dir in sorted(BASE.rglob("ID_2C_NvsSD_nR*_nV*_nF*_Seed_*_Fold_*_runID_*")):
    # The wildcard can also match plain files; only directories are scanned.
    if not run_dir.is_dir():
        continue
    m = pat.match(run_dir.name)
    if not m:
        # Name passed the wildcard but not the stricter regex: skip it.
        continue
    meta = {k: (int(v) if v is not None else None) for k, v in m.groupdict().items()}

    for fp in sorted(run_dir.glob("leaderboard*.csv")):
        df = pd.read_csv(fp)
        # Tag every leaderboard row with the run configuration; these values
        # overwrite same-named columns if the CSV has them.
        for k in ("nR", "nV", "nF", "seed", "fold"):
            df[k] = meta.get(k)
        frames.append(df)

# Fail early: the next step cannot concatenate an empty list.
if not frames:
    raise FileNotFoundError("No se encontraron los leaderboard.")



In [9]:
# Stack every leaderboard that was collected into a single frame.
lb = pd.concat(frames, ignore_index=True)

# Cast the run-configuration columns that are present to the nullable
# integer dtype, then persist.
lb = lb.astype(
    {col: "Int64" for col in ("nR", "nV", "nF", "seed", "fold") if col in lb.columns}
)

lb.to_parquet("leaderboards.parquet", index=False)