In [1]:
# ===============================
# Imports
# ===============================s
import numpy as np
import pandas as pd
import pandas.api.types as pdt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss
from scipy import sparse
import xgboost as xgb
from sklearn.isotonic import IsotonicRegression
import gc
from xgboost.callback import EarlyStopping as XGBEarlyStopping

In [None]:
"""
XGBOOST MODEL TRAINING WITH OUTSTANDING RESULTS THROUGH THE ETL-PIPELINE DATASET
--------------------------------------------------------------------------------
This notebook/script trains, calibrates, and evaluates a binary XGBoost model on
the match-level dataset produced by the ETL pipeline. It follows leak-safe
preprocessing, yearly cross-validation grouped by match `id`, isotonic
probability calibration, and out-of-sample testing (2023–2025). Results are
strong across eras and tournament tiers.

Key takeaways (summarized from the printed runs in this notebook):
- Yearly CV (2000–2025, 26 folds; grouping by `id`):
    * Mean AUC ≈ 0.96375
    * Mean LogLoss ≈ 0.22444
    * Mean Brier ≈ 0.07242
    * Mean Accuracy ≈ 0.89002
- OOF calibration (2000–2022, isotonic):
    * AUC ≈ 0.9719 (stable pre/post calibration)
    * LogLoss: 0.2113 → 0.2099 after calibration
    * Optimal decision threshold by costs: ~0.4959
- Hold-out 2023–2025 (using calibrated probabilities):
    * Mean AUC ≈ 0.9145, LogLoss ≈ 0.3794, Brier ≈ 0.1198
    * Mean Acc@0.5 ≈ 0.8205, Mean Acc@0.496 ≈ 0.8206
- Hold-out by tournament tier (2023–2025, Acc@0.5):
    * Challenger-like:  AUC ≈ 0.9200 | LogLoss ≈ 0.3696 | Brier ≈ 0.1177 | Acc ≈ 0.8228
    * ATP Tour:         AUC ≈ 0.9142 | LogLoss ≈ 0.3738 | Brier ≈ 0.1207 | Acc ≈ 0.8187
    * Masters 1000:     AUC ≈ 0.8971 | LogLoss ≈ 0.4313 | Brier ≈ 0.1352 | Acc ≈ 0.7997
    * Grand Slams:      AUC ≈ 0.9324 | LogLoss ≈ 0.3682 | Brier ≈ 0.1091 | Acc ≈ 0.8443

Design choices that drive these results:
- Strict no-leakage preprocessing with sklearn pipelines and `ColumnTransformer`.
- Categorical encoding via OHE into sparse matrices; numeric features coerced to float32.
- XGBoost `hist` tree method for speed; careful regularization & early stopping.
- “Best iteration” selected on 2022 (hold-out for early stopping), then retraining on ≤2022.
- Isotonic calibration on OOF (2000–2022), applied to 2023–2025 test.
- Efficient batch transforms to minimize overhead.

Usage:
- Set `csv_path` to the dataset CSV exported from the ETL pipeline.
- Run cells in order or execute as a script (they are linear and independent).
- Dependencies: numpy, pandas, scikit-learn, scipy, xgboost.

Notes:
- Metrics above summarize typical outcomes obtained from prior runs on this notebook.
- If your dataset slice differs, numbers may vary slightly.
"""

In [2]:
# ==== 1) Carga ====
csv_path = ""
try:
    df = pd.read_csv(csv_path, low_memory=False, na_values=["", "NA", "NaN"])
    if "tournament_start_dtm" in df.columns:
        df["tournament_start_dtm"] = pd.to_datetime(df["tournament_start_dtm"], errors="coerce")
except FileNotFoundError:
    raise SystemExit(f"No se encontró el archivo: {csv_path}")

# Limpieza de columnas índice exportadas
idx_cols = [c for c in df.columns if c.startswith("Unnamed")]
if "V1" in df.columns:
    idx_cols.append("V1")
if idx_cols:
    df = df.drop(columns=idx_cols)

In [3]:
# ==== 2) Saneos mínimos ====
for c in [c for c in df.columns if c.startswith("log_")]:
    df[c] = df[c].replace([np.inf, -np.inf], np.nan)

In [4]:
# ==== 3) Filtro temporal ====
if "year" not in df.columns:
    raise ValueError("No se encontró la columna 'year'.")
df = df[df["year"] > 1999].copy()

In [5]:
for col in ["match_result", "id"]:
    if col not in df.columns:
        raise ValueError(f"Falta la columna obligatoria '{col}'.")

In [6]:
# ==== 5) y, X base ====
# y binaria ya en el CSV (0/1). Si fuese 'win'/'loss': y = (df["match_result"] == "win").astype(int)
y = df["match_result"].astype(int).values
X = df.drop(columns=["match_result"]).copy()   # <- IMPORTANTÍSIMO: sacar el target de las features

In [7]:
# ==== 6) Categóricas (las tuyas) ====
cat_cols = [
    "surface",
    "player_handedness","player_backhand",
    "opponent_handedness","opponent_backhand",
    "indoor_outdoor",
    "player_rank_trend_4w_cat","player_rank_trend_12w_cat",
    "opponent_rank_trend_4w_cat","opponent_rank_trend_12w_cat",
    "player_win_prob_diff_general_vs_surface_cat",
    "opponent_win_prob_diff_general_vs_surface_cat",
    "best_of",
    "player_home","opponent_home",
    "player_favourite_surface","opponent_favourite_surface",
    "player_good_form_5","player_good_form_10",
    "opponent_good_form_5","opponent_good_form_10",
    "player_won_previous_tournament","opponent_won_previous_tournament",
    "player_back_to_back_week","player_two_weeks_gap","player_long_rest",
    "player_country_changed","player_surface_changed","player_indoor_changed",
    "player_continent_changed","player_red_eye_risk",
    "opponent_back_to_back_week","opponent_two_weeks_gap","opponent_long_rest",
    "opponent_country_changed","opponent_surface_changed",
    "opponent_indoor_changed","opponent_continent_changed",
    "opponent_red_eye_risk",
    "has_player_h2h_surface","has_player_h2h_full"
]
cat_cols = [c for c in cat_cols if c in X.columns]


In [8]:
# ==== 8) Tipos: categóricas a string→object (con np.nan), numéricas a float32 ====
for c in cat_cols:
    X[c] = X[c].astype("string")
X[cat_cols] = X[cat_cols].astype(object)
X[cat_cols] = X[cat_cols].mask(X[cat_cols].isna(), np.nan)

num_cols = [c for c in X.columns if c not in cat_cols]
for c in num_cols:
    if not pdt.is_numeric_dtype(X[c]):
        X[c] = pd.to_numeric(X[c], errors="coerce")
    if pdt.is_numeric_dtype(X[c]):
        X[c] = X[c].astype("float32")

In [9]:
# OneHotEncoder en *sparse*
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="__MISSING__")),
    ("ohe", ohe)
])

# Fuerza salida sparse si hay algo sparse
pre_template = ColumnTransformer(
    transformers=[("cat", cat_transformer, cat_cols)],
    remainder="passthrough",
    sparse_threshold=1.0   # <- clave para que no densifique al concatenar
)

In [10]:
# ==== 10) CV anual (25 folds: 2000..2025) con GARANTÍA por 'id' ====
years_available = sorted(df["year"].unique().tolist())
fold_years = [y for y in range(2000, 2026) if y in years_available]  # 26 folds (2000–2025)
print(f"CV por año con garantía por 'id': {len(fold_years)} folds — años: {fold_years}")

fold_metrics, best_iters = [], []

# (Opcional) Sanidad: cada id en 1 solo año
by_id_years = df.groupby("id")["year"].nunique()
if (by_id_years > 1).any():
    print("Aviso: hay 'id' presentes en >1 año; aun así se agruparán por 'id' en cada fold.")

for fold, y_val in enumerate(fold_years, start=1):
    # ids que aparecen en el año de validación
    ids_val = df.loc[df["year"] == y_val, "id"].unique()

    # máscaras por 'id' (ambas filas del partido van juntas)
    val_mask = df["id"].isin(ids_val)
    train_mask = ~val_mask

    # split
    X_tr_raw, X_va_raw = X.loc[train_mask], X.loc[val_mask]
    y_tr, y_va = y[train_mask], y[val_mask]

    # preprocesamiento sin fuga
    pre_fold = clone(pre_template)
    pre_fold.fit(X_tr_raw)
    Xtr = pre_fold.transform(X_tr_raw)
    Xva = pre_fold.transform(X_va_raw)


    if not sparse.isspmatrix_csr(Xtr):
        Xtr = sparse.csr_matrix(Xtr)
    if not sparse.isspmatrix_csr(Xva):
        Xva = sparse.csr_matrix(Xva)
    
    Xtr = Xtr.astype(np.float32)
    Xva = Xva.astype(np.float32)


    # entrenamiento con xgboost.train (+ early stopping) — versión ligera
    params = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "auc"],
        "tree_method": "hist",
        "eta": 0.05,              # antes 0.05
        "max_depth": 6,           # antes 7
        "min_child_weight": 12,   # antes 8
        "subsample": 0.8,
        "colsample_bytree": 0.65, # antes 0.7
        "colsample_bynode": 0.8,
        "gamma": 2.0,             # antes 1.0
        "lambda": 1.5,            # antes 1.0
        "alpha": 0.8,             # antes 0.5
        "nthread": 8,
        "seed": 42
    }
    dtrain = xgb.DMatrix(Xtr, label=y_tr)
    dvalid = xgb.DMatrix(Xva, label=y_va)
    es = XGBEarlyStopping(rounds=50, save_best=True, maximize=False, metric_name="logloss")  # antes 100
    booster = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=2000,     # antes 4000
        evals=[(dvalid, "valid")],
        callbacks=[es],
        verbose_eval=200
    )


    # métricas
    if y_va.size and len(np.unique(y_va)) > 1:
        p_va = booster.predict(dvalid)
        auc = float(roc_auc_score(y_va, p_va))
        ll  = float(log_loss(y_va, p_va))
        br  = float(brier_score_loss(y_va, p_va))
        acc = float(((p_va >= 0.5) == y_va).mean())   # accuracy con umbral 0.5
    else:
        auc = ll = br = acc = np.nan

    # mejor iteración (robusto)
    best_iter = getattr(booster, "best_iteration", None)
    if best_iter is None:
        best_iter = getattr(booster, "best_ntree_limit", None)
        if best_iter is None:
            best_iter = 4000
    best_iter = int(best_iter)

    fold_metrics.append({
        "fold": fold,
        "year_valid": int(y_val),
        "AUC": auc, "LogLoss": ll, "Brier": br, "Accuracy": acc,
        "best_iteration": best_iter
    })
    best_iters.append(best_iter)

    del booster, dtrain, dvalid, Xtr, Xva, pre_fold, X_tr_raw, X_va_raw, y_tr, y_va
    gc.collect()

    print(f"Fold {fold} | year {y_val} | "
          f"AUC={auc:.4f}  LogLoss={ll:.4f}  Brier={br:.4f}  Acc={acc:.4f}  "
          f"best_iter={best_iter}")

# ==== 11) Resumen CV ====
valid_aucs   = [m["AUC"] for m in fold_metrics if not np.isnan(m["AUC"])]
valid_lls    = [m["LogLoss"] for m in fold_metrics if not np.isnan(m["LogLoss"])]
valid_briers = [m["Brier"] for m in fold_metrics if not np.isnan(m["Brier"])]
valid_accs   = [m["Accuracy"] for m in fold_metrics if not np.isnan(m["Accuracy"])]

print("\n=== CV (1 fold por año; garantía por 'id') ===")
print(f"Folds válidos: {len(valid_aucs)} / {len(fold_years)}")
print(f"Mean AUC:      {np.mean(valid_aucs):.5f}"  if valid_aucs else "Mean AUC: n/a")
print(f"Mean LogLoss:  {np.mean(valid_lls):.5f}"   if valid_lls else "Mean LogLoss: n/a")
print(f"Mean Brier:    {np.mean(valid_briers):.5f}"if valid_briers else "Mean Brier: n/a")
print(f"Mean Accuracy: {np.mean(valid_accs):.5f}"  if valid_accs else "Mean Accuracy: n/a")

CV por año con garantía por 'id': 26 folds — años: [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
[0]	valid-logloss:0.67906	valid-auc:0.74039
[200]	valid-logloss:0.45785	valid-auc:0.83105
[217]	valid-logloss:0.45805	valid-auc:0.83150
Fold 1 | year 2000 | AUC=0.8308  LogLoss=0.4567  Brier=0.1598  Acc=0.7154  best_iter=167
[0]	valid-logloss:0.66611	valid-auc:0.93306
[200]	valid-logloss:0.17782	valid-auc:0.98144
[400]	valid-logloss:0.17209	valid-auc:0.98228
[600]	valid-logloss:0.16925	valid-auc:0.98260
[800]	valid-logloss:0.16763	valid-auc:0.98272
[910]	valid-logloss:0.16763	valid-auc:0.98264
Fold 2 | year 2001 | AUC=0.9827  LogLoss=0.1674  Brier=0.0531  Acc=0.9190  best_iter=860
[0]	valid-logloss:0.66632	valid-auc:0.92722
[200]	valid-logloss:0.17819	valid-auc:0.98159
[400]	valid-logloss:0.17160	valid-auc:0.98247
[600]	valid-logloss:0.16799	valid-auc:0.98299
[800]	valid-logloss:0.

In [11]:
# ==== 12) OOF 2000–2022: calibración isotónica + umbral por costes ====
from sklearn.isotonic import IsotonicRegression

def find_optimal_threshold(y_true, prob, cost_fp=1.0, cost_fn=1.0):
    y = np.asarray(y_true, dtype=int)
    p = np.asarray(prob, dtype=float)
    order = np.argsort(-p)  # desc
    p_sorted = p[order]
    y_sorted = y[order]
    pos = (y_sorted == 1).astype(int)
    neg = 1 - pos
    pref_pos = np.concatenate([[0], np.cumsum(pos)])
    pref_neg = np.concatenate([[0], np.cumsum(neg)])
    total_pos = int(pref_pos[-1])
    i = np.arange(0, len(p_sorted) + 1)  # nº predicciones positivas
    FP = pref_neg[i]
    FN = total_pos - pref_pos[i]
    cost = cost_fp * FP + cost_fn * FN
    i_best = int(np.argmin(cost))
    if i_best == 0:
        thr = 1.0 + 1e-12
    elif i_best == len(p_sorted):
        thr = 0.0
    else:
        thr = (p_sorted[i_best - 1] + p_sorted[i_best]) / 2.0
    return float(thr), float(cost[i_best] / len(y))

years_all = sorted(df["year"].unique())
oof_years = [yy for yy in years_all if 2000 <= yy <= 2022]
oof_pred = np.full(df.shape[0], np.nan, dtype=np.float32)
mask_pool = df["year"].isin(oof_years).values

print(f"\nConstruyendo OOF para calibración en {len(oof_years)} años (2000–2022)...")
for k, y_val in enumerate(oof_years, start=1):
    ids_val = df.loc[df["year"] == y_val, "id"].unique()
    val_mask = df["id"].isin(ids_val).values
    train_mask = mask_pool & (~val_mask)

    X_tr_raw, X_va_raw = X.loc[train_mask], X.loc[val_mask]
    y_tr, y_va = y[train_mask], y[val_mask]

    pre_fold = clone(pre_template)
    pre_fold.fit(X_tr_raw)
    Xtr = pre_fold.transform(X_tr_raw)
    Xva = pre_fold.transform(X_va_raw)

    if not sparse.isspmatrix_csr(Xtr):
        Xtr = sparse.csr_matrix(Xtr)
    if not sparse.isspmatrix_csr(Xva):
        Xva = sparse.csr_matrix(Xva)
    Xtr = Xtr.astype(np.float32)
    Xva = Xva.astype(np.float32)

    params_oof = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "auc"],
        "tree_method": "hist",
        "eta": 0.10, "max_depth": 6, "min_child_weight": 12,
        "subsample": 0.8, "colsample_bytree": 0.65, "colsample_bynode": 0.8,
        "gamma": 2.0, "lambda": 1.5, "alpha": 0.8,
        "nthread": 8, "seed": 42
    }
    dtr = xgb.DMatrix(Xtr, label=y_tr)
    dva = xgb.DMatrix(Xva, label=y_va)
    es_oof = XGBEarlyStopping(rounds=50, save_best=True, maximize=False, metric_name="logloss")
    booster = xgb.train(
        params=params_oof,
        dtrain=dtr,
        num_boost_round=2000,
        evals=[(dva, "valid")],
        callbacks=[es_oof],
        verbose_eval=False
    )

    oof_pred[val_mask] = booster.predict(dva).astype(np.float32)

    del booster, dtr, dva, pre_fold, Xtr, Xva, X_tr_raw, X_va_raw, y_tr, y_va
    gc.collect()
    print(f"OOF fold {k}/{len(oof_years)} (año {y_val}) listo")

idx = np.where(mask_pool & ~np.isnan(oof_pred))[0]
y_oof = y[idx]
p_oof = oof_pred[idx]

# Calibración isotónica
iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(p_oof, y_oof)
p_oof_cal = iso.predict(p_oof)

# Métricas OOF (sin y con calibración)
def _m(ytrue, p):
    return (
        float(roc_auc_score(ytrue, p)),
        float(log_loss(ytrue, p)),
        float(brier_score_loss(ytrue, p)),
        float(((p >= 0.5).astype(int) == ytrue).mean())
    )

auc_r, ll_r, br_r, acc_r = _m(y_oof, p_oof)
auc_c, ll_c, br_c, acc_c = _m(y_oof, p_oof_cal)
print(f"\nOOF sin calib:  AUC {auc_r:.4f} | LogLoss {ll_r:.4f} | Brier {br_r:.4f} | Acc@0.5 {acc_r:.4f}")
print(f"OOF calibrado:  AUC {auc_c:.4f} | LogLoss {ll_c:.4f} | Brier {br_c:.4f} | Acc@0.5 {acc_c:.4f}")

# Umbral por costes (inicialmente cost_fp=1, cost_fn=1)
thr_star, cost_star = find_optimal_threshold(y_oof, p_oof_cal, cost_fp=1.0, cost_fn=1.0)
print(f"Umbral óptimo por costes -> thr* = {thr_star:.4f} | coste medio {cost_star:.5f}")


Construyendo OOF para calibración en 23 años (2000–2022)...
OOF fold 1/23 (año 2000) listo
OOF fold 2/23 (año 2001) listo
OOF fold 3/23 (año 2002) listo
OOF fold 4/23 (año 2003) listo
OOF fold 5/23 (año 2004) listo
OOF fold 6/23 (año 2005) listo
OOF fold 7/23 (año 2006) listo
OOF fold 8/23 (año 2007) listo
OOF fold 9/23 (año 2008) listo
OOF fold 10/23 (año 2009) listo
OOF fold 11/23 (año 2010) listo
OOF fold 12/23 (año 2011) listo
OOF fold 13/23 (año 2012) listo
OOF fold 14/23 (año 2013) listo
OOF fold 15/23 (año 2014) listo
OOF fold 16/23 (año 2015) listo
OOF fold 17/23 (año 2016) listo
OOF fold 18/23 (año 2017) listo
OOF fold 19/23 (año 2018) listo
OOF fold 20/23 (año 2019) listo
OOF fold 21/23 (año 2020) listo
OOF fold 22/23 (año 2021) listo
OOF fold 23/23 (año 2022) listo

OOF sin calib:  AUC 0.9719 | LogLoss 0.2113 | Brier 0.0679 | Acc@0.5 0.8974
OOF calibrado:  AUC 0.9719 | LogLoss 0.2099 | Brier 0.0678 | Acc@0.5 0.8975
Umbral óptimo por costes -> thr* = 0.4959 | coste medio 0.1

In [12]:
#==== 13) ==== Entrenamiento final con hold-out (train<=2022, test 2023–2025) — versión eficiente ====
mask_tr_es     = df["year"] <= 2021
mask_va_es     = df["year"] == 2022
mask_fit_final = df["year"] <= 2022
mask_test      = (df["year"] >= 2023) & (df["year"] <= 2025)

# --- Early stopping en 2022 (preprocesador SOLO con <=2021, para no fugar) ---
pre_es  = clone(pre_template)
Xtr_es  = pre_es.fit_transform(X.loc[mask_tr_es])   # 1 pasada
Xva_es  = pre_es.transform(X.loc[mask_va_es])

if not sparse.isspmatrix_csr(Xtr_es): Xtr_es = sparse.csr_matrix(Xtr_es)
if not sparse.isspmatrix_csr(Xva_es): Xva_es = sparse.csr_matrix(Xva_es)
Xtr_es = Xtr_es.astype(np.float32)
Xva_es = Xva_es.astype(np.float32)

params_es = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "auc"],
    "tree_method": "hist",
    "eta": 0.10, "max_depth": 6, "min_child_weight": 12,
    "subsample": 0.8, "colsample_bytree": 0.65, "colsample_bynode": 0.8,
    "gamma": 2.0, "lambda": 1.5, "alpha": 0.8,
    "nthread": 8, "seed": 42
}
dtr_es = xgb.DMatrix(Xtr_es, label=y[mask_tr_es])
dva_es = xgb.DMatrix(Xva_es, label=y[mask_va_es])
es_cb  = XGBEarlyStopping(rounds=50, save_best=True, maximize=False, metric_name="logloss")

booster_es = xgb.train(
    params_es, dtr_es, num_boost_round=2000,
    evals=[(dva_es, "valid")], callbacks=[es_cb], verbose_eval=100
)

best_iter = getattr(booster_es, "best_iteration", None)
if best_iter is None:
    best_iter = getattr(booster_es, "best_ntree_limit", 1000)
best_iter = int(best_iter)
print(f"\n[Hold-out] best_iter={best_iter}")

# Limpieza temprana
del pre_es, Xtr_es, Xva_es, dtr_es, dva_es, booster_es
gc.collect()

# --- Reentrenar final con <=2022 usando best_iter (nuevo preprocesador con <=2022) ---
pre_final = clone(pre_template)
X_final   = pre_final.fit_transform(X.loc[mask_fit_final])  # 1 pasada

if not sparse.isspmatrix_csr(X_final): X_final = sparse.csr_matrix(X_final)
X_final = X_final.astype(np.float32)

d_final = xgb.DMatrix(X_final, label=y[mask_fit_final])
booster_final = xgb.train(params_es, d_final, num_boost_round=best_iter, evals=[], verbose_eval=False)

# Limpieza intermedia
del X_final, d_final
gc.collect()

# --- TEST 2023–2025: transformar y predecir en UN SOLO BATCH ---
if mask_test.any():
    Xt_test = pre_final.transform(X.loc[mask_test])
    if not sparse.isspmatrix_csr(Xt_test): Xt_test = sparse.csr_matrix(Xt_test)
    Xt_test = Xt_test.astype(np.float32)

    dtest = xgb.DMatrix(Xt_test)
    p_raw_test = booster_final.predict(dtest)
    p_cal_test = iso.predict(p_raw_test)  # calibración OOF 2000–2022
    y_test     = y[mask_test]
    years_test = df.loc[mask_test, "year"].astype(int).values

    # métricas por año sin repetir transform/predict
    print("\n=== Test 2023–2025 (probabilidades calibradas) ===")
    mean_vals = {"auc": [], "ll": [], "br": [], "acc05": [], "accst": []}
    for yr in (2023, 2024, 2025):
        m = (years_test == yr)
        if not np.any(m): 
            continue
        y_true = y_test[m]
        p      = p_cal_test[m]
        auc = float(roc_auc_score(y_true, p))
        ll  = float(log_loss(y_true, p))
        br  = float(brier_score_loss(y_true, p))
        acc05 = float(((p >= 0.5).astype(int) == y_true).mean())
        accst = float(((p >= thr_star).astype(int) == y_true).mean())
        print(f"Año {yr} | AUC {auc:.4f} | LogLoss {ll:.4f} | Brier {br:.4f} | "
              f"Acc@0.5 {acc05:.4f} | Acc@{thr_star:.3f} {accst:.4f}")
        mean_vals["auc"].append(auc); mean_vals["ll"].append(ll); mean_vals["br"].append(br)
        mean_vals["acc05"].append(acc05); mean_vals["accst"].append(accst)

    if mean_vals["auc"]:
        from statistics import mean
        print(f"\nMedia 2023–2025 | AUC {mean(mean_vals['auc']):.4f} | "
              f"LogLoss {mean(mean_vals['ll']):.4f} | Brier {mean(mean_vals['br']):.4f} | "
              f"Acc@0.5 {mean(mean_vals['acc05']):.4f} | Acc@{thr_star:.3f} {mean(mean_vals['accst']):.4f}")

    # limpieza test
    del Xt_test, dtest, p_raw_test, p_cal_test, y_test, years_test
    gc.collect()
else:
    print("\nNo hay filas en 2023–2025 en este dataset.")

[0]	valid-logloss:0.64551	valid-auc:0.89403
[100]	valid-logloss:0.24373	valid-auc:0.96321
[200]	valid-logloss:0.23502	valid-auc:0.96556
[300]	valid-logloss:0.23250	valid-auc:0.96615
[400]	valid-logloss:0.23155	valid-auc:0.96640
[500]	valid-logloss:0.23059	valid-auc:0.96664
[600]	valid-logloss:0.23020	valid-auc:0.96665
[700]	valid-logloss:0.22961	valid-auc:0.96675
[800]	valid-logloss:0.22925	valid-auc:0.96682
[836]	valid-logloss:0.22945	valid-auc:0.96675

[Hold-out] best_iter=786

=== Test 2023–2025 (probabilidades calibradas) ===
Año 2023 | AUC 0.9626 | LogLoss 0.2421 | Brier 0.0783 | Acc@0.5 0.8803 | Acc@0.496 0.8802
Año 2024 | AUC 0.8895 | LogLoss 0.4475 | Brier 0.1413 | Acc@0.5 0.7871 | Acc@0.496 0.7881
Año 2025 | AUC 0.8915 | LogLoss 0.4485 | Brier 0.1399 | Acc@0.5 0.7940 | Acc@0.496 0.7934

Media 2023–2025 | AUC 0.9145 | LogLoss 0.3794 | Brier 0.1198 | Acc@0.5 0.8205 | Acc@0.496 0.8206


In [13]:
# === Hold-out 2023–2025 por tipo de torneo (train <=2022), umbral 0.5 ===

# 1) Máscaras temporales
mask_tr_es     = df["year"] <= 2021
mask_va_es     = df["year"] == 2022
mask_fit_final = df["year"] <= 2022
mask_test      = (df["year"] >= 2023) & (df["year"] <= 2025)

# 2) Features sin 'id' por seguridad
X_use = X.drop(columns=["id"], errors="ignore")

# 3) Early stopping en 2022 para elegir best_iter
pre_es = clone(pre_template)
Xtr_es = pre_es.fit_transform(X_use.loc[mask_tr_es])
Xva_es = pre_es.transform(X_use.loc[mask_va_es])

if not sparse.isspmatrix_csr(Xtr_es): Xtr_es = sparse.csr_matrix(Xtr_es)
if not sparse.isspmatrix_csr(Xva_es): Xva_es = sparse.csr_matrix(Xva_es)
Xtr_es = Xtr_es.astype(np.float32); Xva_es = Xva_es.astype(np.float32)

params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "auc"],
    "tree_method": "hist",
    "eta": 0.05,
    "max_depth": 6,
    "min_child_weight": 12,
    "subsample": 0.8,
    "colsample_bytree": 0.65,
    "colsample_bynode": 0.8,
    "gamma": 2.0,
    "lambda": 1.5,
    "alpha": 0.8,
    "nthread": 8,
    "seed": 42
}
dtr_es = xgb.DMatrix(Xtr_es, label=y[mask_tr_es])
dva_es = xgb.DMatrix(Xva_es, label=y[mask_va_es])

es_cb = XGBEarlyStopping(rounds=50, save_best=True, maximize=False, metric_name="logloss")
booster_es = xgb.train(params, dtr_es, num_boost_round=2000, evals=[(dva_es, "valid")], callbacks=[es_cb], verbose_eval=100)

best_iter = getattr(booster_es, "best_iteration", None)
if best_iter is None:
    best_iter = getattr(booster_es, "best_ntree_limit", 1000)
best_iter = int(best_iter)
print(f"[Hold-out selector] best_iter={best_iter}")

# Limpieza
del pre_es, Xtr_es, Xva_es, dtr_es, dva_es, booster_es
gc.collect()

# 4) Reentrenar final con <=2022 usando best_iter
pre_final = clone(pre_template)
X_final = pre_final.fit_transform(X_use.loc[mask_fit_final])
if not sparse.isspmatrix_csr(X_final): X_final = sparse.csr_matrix(X_final)
X_final = X_final.astype(np.float32)

d_final = xgb.DMatrix(X_final, label=y[mask_fit_final])
booster_final = xgb.train(params, d_final, num_boost_round=best_iter, evals=[], verbose_eval=False)

del X_final, d_final
gc.collect()

# 5) Transformar y predecir TODO 2023–2025 de una vez
if mask_test.any():
    Xt_test = pre_final.transform(X_use.loc[mask_test])
    if not sparse.isspmatrix_csr(Xt_test): Xt_test = sparse.csr_matrix(Xt_test)
    Xt_test = Xt_test.astype(np.float32)
    dtest   = xgb.DMatrix(Xt_test)
    p_raw   = booster_final.predict(dtest)

    # Usa calibración si 'iso' existe; si no, usa probas crudas
    try:
        p_test = iso.predict(p_raw)
    except NameError:
        p_test = p_raw

    y_test     = y[mask_test]
    years_test = df.loc[mask_test, "year"].astype(int).values

    # Serie segura para operaciones de texto (maneja NA)
    tc_s = (
        df.loc[mask_test, "tournament_category"]
          .astype("string")
          .str.strip()
          .str.lower()
    )

    def mask_eq_or_starts(s: pd.Series, token: str) -> pd.Series:
        tok = token.lower()
        return s.eq(tok) | s.str.startswith(tok, na=False)

    groups = [
        ("ch",   "Challenger-like"),
        ("atp",  "ATP Tour"),
        ("1000", "Masters 1000"),
        ("gs",   "Grand Slams"),
    ]

    print("\n=== Hold-out 2023–2025 por tipo de torneo (umbral 0.5) ===")
    for token, label in groups:
        m = mask_eq_or_starts(tc_s, token).to_numpy()
        if not np.any(m):
            print(f"{label}: sin datos en 2023–2025.")
            continue

        y_g = y_test[m]
        p_g = p_test[m]
        auc = float(roc_auc_score(y_g, p_g)) if len(np.unique(y_g)) > 1 else np.nan
        ll  = float(log_loss(y_g, p_g))
        br  = float(brier_score_loss(y_g, p_g))
        acc = float(((p_g >= 0.5).astype(int) == y_g).mean())
        print(f"{label} | n={m.sum()} | AUC {auc:.4f} | LogLoss {ll:.4f} | Brier {br:.4f} | Acc@0.5 {acc:.4f}")

        # (Opcional) métricas por año dentro del grupo
        for yr in (2023, 2024, 2025):
            my = m & (years_test == yr)
            if not np.any(my):
                continue
            yg, pg = y_test[my], p_test[my]
            aucy = float(roc_auc_score(yg, pg)) if len(np.unique(yg)) > 1 else np.nan
            lly  = float(log_loss(yg, pg))
            bry  = float(brier_score_loss(yg, pg))
            accy = float(((pg >= 0.5).astype(int) == yg).mean())
            print(f"   - Año {yr} | n={my.sum()} | AUC {aucy:.4f} | LogLoss {lly:.4f} | Brier {bry:.4f} | Acc@0.5 {accy:.4f}")

    # limpieza test
    del Xt_test, dtest, p_raw, p_test, y_test, years_test, tc_s
    gc.collect()
else:
    print("No hay filas en 2023–2025.")


[0]	valid-logloss:0.66215	valid-auc:0.94571
[100]	valid-logloss:0.25757	valid-auc:0.96049
[200]	valid-logloss:0.24299	valid-auc:0.96343
[300]	valid-logloss:0.23839	valid-auc:0.96467
[400]	valid-logloss:0.23515	valid-auc:0.96554
[500]	valid-logloss:0.23347	valid-auc:0.96601
[600]	valid-logloss:0.23181	valid-auc:0.96640
[700]	valid-logloss:0.23102	valid-auc:0.96662
[800]	valid-logloss:0.23020	valid-auc:0.96683
[900]	valid-logloss:0.22968	valid-auc:0.96694
[1000]	valid-logloss:0.22945	valid-auc:0.96698
[1100]	valid-logloss:0.22913	valid-auc:0.96705
[1200]	valid-logloss:0.22912	valid-auc:0.96705
[1300]	valid-logloss:0.22879	valid-auc:0.96711
[1365]	valid-logloss:0.22864	valid-auc:0.96713
[Hold-out selector] best_iter=1315

=== Hold-out 2023–2025 por tipo de torneo (umbral 0.5) ===
Challenger-like | n=54682 | AUC 0.9200 | LogLoss 0.3696 | Brier 0.1177 | Acc@0.5 0.8228
   - Año 2023 | n=18968 | AUC 0.9638 | LogLoss 0.2380 | Brier 0.0772 | Acc@0.5 0.8831
   - Año 2024 | n=20106 | AUC 0.8942 |