# Requirements and Libraries

In [1]:
pip install xgboost lightgbm catboost lifelines optuna

Note: you may need to restart the kernel to use updated packages.


In [2]:
# aquí dejo un "special import" (luego tengo temas con este import)
def _pip_install(pkg):
    try:
        __import__(pkg.split('==')[0].replace("-","_"))
    except ImportError:
        import subprocess, sys
        subprocess.check_call([sys.executeable, "-m", "pip", "install", pkg])
        
# Packages to install on demand; each entry is passed to _pip_install as a pip requirement string.
for pkg in ['holidays']:
    _pip_install(pkg)
        

In [3]:
#basics
import os
import sys
import re
import json
import warnings
import joblib
import unicodedata
import logging

#data manipulation
import pandas as pd
import numpy as np
import typing as t
import holidays

#Data Viz
import seaborn as sns
import matplotlib.pyplot as plt

#info
from pathlib import Path
from zoneinfo import ZoneInfo
from dataclasses import dataclass
from datetime import datetime, date

#google bigquery
from google.cloud import bigquery

#statistics
import scipy
from scipy import stats
from scipy.stats import ks_2samp
from lifelines import KaplanMeierFitter, CoxPHFitter
from math import pi
#sklearn
import sklearn
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.utils import Bunch
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.calibration import IsotonicRegression, calibration_curve
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

#modelos
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


#metrics
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss,confusion_matrix,classification_report
from fairlearn.metrics import MetricFrame


#optimization models
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer

# NOTE(review): this hides every warning raised after this point, including
# deprecation and convergence warnings from the modelling libraries.
warnings.filterwarnings('ignore')
# Seed shared by the Config dataclass (random_state) and numpy's global RNG.
RNG_SEED = 42
np.random.seed(RNG_SEED)


# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Plot and pandas display configuration
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.4f}'.format)

logger.info("✓ Librerías importadas y configuración completa")

# Data Load

In [4]:
# Pull the full modelling table from BigQuery into a pandas DataFrame.
# NOTE(review): the project id and table name are repeated as literals here and
# again as defaults of Config (project_id / table_fqn); keep the two in sync.
client = bigquery.Client(project="spin-aip-singularity-comp-sb")

# SELECT * loads every column of the table (37 columns per the df.info() output below).
query = """ SELECT * FROM `spin-aip-singularity-comp-sb.model_activation.dataste_model_activation_timewindow_30D_V-1-5-0`"""

data = client.query(query).to_dataframe()



In [5]:
# Preview the first rows of the raw extract.
# NOTE(review): the rendered output includes user_id values; clear it before sharing the notebook.
data.head()

Unnamed: 0,user_id,signup_date,signup_ts,userTypeIdentifier,channelUserIdentifier,accountLevel,stateName,gender,user_type,channelDetail,birth_date,birthState,Card_linked_date,IsActive,phn_confir,email_confir,phone_conf_ts,email_conf_ts,phn_confir_d7,email_confir_d7,both_confir_d7,premia_accountid,has_premia,activation_date_ever,activation_date_30d,label_activated_30d,tx_30d_count,tx_30d_amount,label_5tx_30d,first_tx_type,first_tx_amount,activation_channel,latest_tx_date,lifespan_days,days_since_last,tx_30d_from_activation,days_to_first_activation
0,cc4ed9e9-1c5b-48bf-8a29-9c047efb0c09,2025-05-17,2025-05-17 23:09:45.815000+00:00,3,1,3,CO,female,HYBRID,POS,1999-12-25,CL,2025-05-17,True,1,1,2025-05-18 23:23:48.667000+00:00,2025-05-21 03:25:21.647000+00:00,1,1,1,M478OM4,1,2025-05-18,2025-05-18,1,73,19840.65,1,TRANSFER_TO_CARD,400.0,SPEI/Transfer,2025-11-17,184,4,76,1
1,cc4ffa4d-6a98-4577-b86f-d25debb0e8d1,2025-04-21,2025-04-21 21:08:53.993000+00:00,3,2,3,NL,female,HYBRID,ORGANIC,2005-11-09,VZ,2025-04-21,True,1,1,2025-04-22 03:09:03.281000+00:00,2025-06-19 11:27:02.956000+00:00,1,0,0,,0,2025-04-23,2025-04-23,1,40,3157.77,1,CASH_IN_AT_OXXO,380.0,CashIn_OXXO,2025-11-19,211,2,41,2
2,cc517a82-58ce-4267-aa84-ce34bb0677ca,2025-09-19,2025-09-19 18:46:43.352000+00:00,3,1,2,YU,male,HYBRID,POS,1985-09-23,YU,2025-09-19,True,1,0,2025-09-20 02:08:16.336000+00:00,NaT,1,0,0,SGP33SR,1,2025-09-20,2025-09-20,1,105,32864.46,1,CARD_PURCHASE,298.5,Card,2025-11-20,62,1,108,1
3,cc51c892-65e4-48a3-912d-aa1baec3492c,2025-05-16,2025-05-16 17:33:22.792000+00:00,3,2,2,MC,male,HYBRID,ORGANIC,1967-04-09,MC,2025-05-16,True,1,1,2025-05-16 23:33:33.606000+00:00,2025-05-19 20:45:56.557000+00:00,1,1,1,IKGVB3C,1,2025-05-16,2025-05-16,1,46,8916.07,1,CASH_OUT_WITH_CARD_AT_OXXO,769.67,Card,2025-11-17,186,4,46,0
4,cc5283bd-61ee-472d-a2c9-db7bc657f83b,2025-01-20,2025-01-20 08:01:54.389000+00:00,2,2,3,VZ,female,DIGITAL,ORGANIC,1989-06-23,VE,NaT,True,1,1,2025-01-20 14:02:00.835000+00:00,2025-01-20 14:11:49.317000+00:00,1,1,1,1d0de28f-947c-4585-9cbd-1816bc72afda,1,2025-01-20,2025-01-20,1,39,14758.0,1,TRANSFER_TO_CLABE,1700.0,SPEI/Transfer,2025-09-08,232,74,39,0


In [6]:
# Work on a deep copy so the raw extract in `data` stays untouched.
df = data.copy(deep=True)
# Column names and dtypes of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2384265 entries, 0 to 2384264
Data columns (total 37 columns):
 #   Column                    Dtype              
---  ------                    -----              
 0   user_id                   object             
 1   signup_date               dbdate             
 2   signup_ts                 datetime64[us, UTC]
 3   userTypeIdentifier        Int64              
 4   channelUserIdentifier     Int64              
 5   accountLevel              Int64              
 6   stateName                 object             
 7   gender                    object             
 8   user_type                 object             
 9   channelDetail             object             
 10  birth_date                dbdate             
 11  birthState                object             
 12  Card_linked_date          dbdate             
 13  IsActive                  boolean            
 14  phn_confir                Int64              
 15  email_confir   

In [7]:
# (rows, columns) of the working frame.
df.shape

(2384265, 37)

# Feature Engineering

In [8]:
@dataclass
class Config:
    """Central settings for this notebook; a single instance is exposed as ``CFG``."""
    # BigQuery project and fully-qualified source table.
    # NOTE(review): the data-load cell hard-codes the same literals instead of reading these fields.
    project_id: str = "spin-aip-singularity-comp-sb"
    table_fqn: str = "spin-aip-singularity-comp-sb.model_activation.dataste_model_activation_timewindow_30D_V-1-5-0"
    # Name of the binary target column; FeatureBuilder.transform builds y from it.
    label_col: str = "label_5tx_30d"
    # Signup timestamp / date columns and the local timezone used for calendar features.
    signup_ts_col: str = "signup_ts"
    signup_date_col: str = "signup_date"
    tz_local: str = "America/Mexico_City"
    # Days left out between the end of training and the start of the holdout window.
    embargo_days: int = 3
    # Length in days of the holdout window, counted back from the latest signup date.
    holdout_days: int = 14
    # NOTE(review): n_splits and train_sample_frac are not read by any cell visible in this notebook.
    n_splits: int = 5
    train_sample_frac: float = 1.0
    # NOTE(review): only printed by the feature-building cell; the population filter
    # there is written as `label_activated_30d == 1`. Confirm the intended meaning.
    activation_filter: int = 1
    # Seed passed to the model as random_state.
    random_state: int = RNG_SEED
    # Top fractions of the score ranking at which capture rate and lift are reported.
    lift_fracs: t.Tuple[float, ...] = (0.01, 0.02, 0.05, 0.10)

CFG = Config()

In [9]:
# Leakage-prone columns and identifiers that FeatureBuilder.transform removes from the features.
# Membership is tested by exact column name.
# NOTE(review): entries containing "*" (e.g. "*latest_tx*") are therefore compared
# literally and do not act as wildcards.
# NOTE(review): most post-activation columns below are commented out, so they are
# NOT dropped by this set; confirm each one is available at scoring time.
LEAKY_ALWAYS = {
    #"label_activated_30d"
    "y_w0",
    "y_w1",
    "y_w7",
    "y_w30",
    "y_cum30",
    "label_5tx_30d",
    # post-activation / post-window information
    #"activation_date_ever",
    #"activation_date_30d",
    #"days_to_first_activation",
    #"tx_30d_count",
    #"tx_30d_amount",
    #"tx_30d_from_activation", 
    #"first_tx_type",
    #"first_tx_amount",
    #"latest_tx_date",
    # confirmations, unless they are strictly earlier than each horizon's cutoff
#    "phn_confir","email_confir",
    #"phn_confir_d7",
    #"email_confir_d7",
    #"both_confir_d7",
    #'Card_linked_date',
    #"activation_*",
   # "*_30d_*",
    "*latest_tx*",
    # identifiers
    "user_id", 
    "userid",
    "channelUserIdentifier",
    "premia_accountid", 
    "accountid", 
    "member_id", 
    "spin_user_id", 
    "id"
}

In [10]:
# Column names that assert_no_labelish_cols refuses to see in a feature frame
# (exact-name match against the frame's columns).
# NOTE(review): "label_activated_30d" is banned here but commented out in LEAKY_ALWAYS,
# so the two lists disagree on that column.
LEAK_BAN = {
    #"activation_date_30d"
    "y_w0",
    "y_w1",
    "y_w7",
    "y_w30",
    "y_cum30",
    "label_activated_30d",
    "label_5tx_30d",
    #"activation_date_ever",
    #"days_to_first_activation",
    #"latest_tx_date",
    #"tx_30d_count",
    #"tx_30d_amount",
    #"tx_30d_from_activation",
    #"first_tx_type",
    #"first_tx_amount",
    #"activation_channel"
}

In [11]:
# Anti-leak regexes, checked in addition to the LEAKY_ALWAYS set.
# assert_no_regex_leak searches every column name against each pattern, case-insensitively.
# NOTE(review): every pattern is commented out, so the list is empty and the regex
# check currently accepts any column.
LEAK_PATTERNS = [
    #r"(^|_)activation(_|$)", 
    #r"(^|_)first_tx(_|$)", 
    #r"(^|_)latest_tx(_|$)",
    #r"(^|_)tx_30d(_|$)", 
    #r"(^|_)days_to_first(_|$)", 
 #   r"(^|_)from_activation(_|$)"
]

In [12]:
def assert_no_regex_leak(df_like: pd.DataFrame):
    """Fail if any column name matches one of the LEAK_PATTERNS regexes.

    Matching uses ``re.search`` with ``re.IGNORECASE``. With an empty
    LEAK_PATTERNS list nothing can match and the check always passes.

    Raises:
        AssertionError: listing the sorted, de-duplicated offending column names.
    """
    offenders = [
        col
        for col in df_like.columns
        if any(re.search(pattern, col, flags=re.IGNORECASE) for pattern in LEAK_PATTERNS)
    ]
    assert not offenders, f"LEAKAGE by regex: quita columnas {sorted(set(offenders))}"

def assert_no_labelish_cols(df_like):
    """Fail if any column of ``df_like`` is listed in LEAK_BAN (exact-name match).

    Raises:
        AssertionError: listing the banned columns in the order they appear.
    """
    banned_present = []
    for col in df_like.columns:
        if col in LEAK_BAN:
            banned_present.append(col)
    assert not banned_present, f"LEAKAGE: quita columnas {banned_present}"


# Raw confirmation timestamps; FeatureBuilder.transform derives flags from them and then drops them.
EXTRA_DROP_TS = {"phone_conf_ts", "email_conf_ts"}

# Normalización de estados (stateName -> siglas)
def _norm(s: str) -> str:
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    s = re.sub(r"[^A-Z ]", "", s.upper())
    return re.sub(r"\s+", " ", s).strip()

# Normalized (upper-case, unaccented) state name -> two-letter abbreviation used by
# the state one-hot features. Keys must already be in the form produced by _norm.
STATE_TO_ABBR = {
    "AGUASCALIENTES":"AG","BAJA CALIFORNIA":"BC","BAJA CALIFORNIA SUR":"BS","CAMPECHE":"CM",
    "CHIAPAS":"CS","CHIHUAHUA":"CH","CIUDAD DE MEXICO":"DF","COAHUILA":"CO","COLIMA":"CL",
    "DURANGO":"DG","GUANAJUATO":"GT","GUERRERO":"GR","HIDALGO":"HG","JALISCO":"JA","MEXICO":"EM",
    "MICHOACAN":"MI","MORELOS":"MO","NAYARIT":"NA","NUEVO LEON":"NL","OAXACA":"OA","PUEBLA":"PU",
    "QUERETARO":"QT","QUINTANA ROO":"QR","SAN LUIS POTOSI":"SL","SINALOA":"SI","SONORA":"SO",
    "TABASCO":"TB","TAMAULIPAS":"TM","TLAXCALA":"TL","VERACRUZ":"VE","YUCATAN":"YU","ZACATECAS":"ZA"
}
# Alternative spellings rewritten to a STATE_TO_ABBR key before the lookup.
STATE_SYNONYMS = {"CDMX":"CIUDAD DE MEXICO","ESTADO DE MEXICO":"MEXICO","EDOMEX":"MEXICO"}

# Canonical birthState codes plus regional buckets.
# CANON maps raw birthState codes (upper-cased by the caller) to the abbreviations
# used as REGION_BUCKET keys.
# NOTE(review): if birthState follows the RENAPO/CURP entity codes, several aliases
# look wrong (there MC = Mexico state, MS = Morelos, CC = Campeche, CL = Coahuila,
# SL = Sinaloa, NE = born abroad); confirm the source code system before relying on
# birth_bucket.
# NOTE(review): the None key is never hit by FeatureBuilder._canon_birthstate, which
# returns "OT" for missing values and looks up str(s).upper() otherwise.
CANON = {
    "SR":"SO","SO":"SO","VZ":"VE","VE":"VE","YN":"YU","YU":"YU","JC":"JA","JA":"JA","MC":"MI","MI":"MI",
    "TS":"TM","TM":"TM","TC":"TB","TB":"TB","CC":"CL","CL":"CL","DF":"DF","EM":"EM","NL":"NL","BC":"BC",
    "BS":"BS","SI":"SI","NA":"NA","DG":"DG","ZA":"ZA","AG":"AG","SL":"SL","HG":"HG","MO":"MO","TL":"TL",
    "PU":"PU","QT":"QT","GT":"GT","OA":"OA","CM":"CM","CS":"CS","CO":"CO","GR":"GR","QR":"QR","CH":"CH",
    "MS":"MI","MN":"MI","SP":"SL","NE":"NL","OC":"OA","PL":"PU","NT":"NA","ZS":"ZA","AS":"AG","UN":"OT", None:"OT"
}
# State abbreviation -> integer region code; "OT" (other/unknown) maps to 0.
REGION_BUCKET = {
    "BC":1,"SO":1,"CH":1,"CO":1,"NL":1,"TM":1,                # North
    "BS":2,"SI":2,"NA":2,"DG":2,"ZA":2,                       # North-West
    "JA":3,"AG":3,"CL":3,"MI":3,"SL":3,                       # Center-North
    "DF":4,"EM":4,"HG":4,"MO":4,"TL":4,"PU":4,"QT":4,"GT":4,  # Center
    "CS":5,"TB":5,"CM":5,"YU":5,"QR":5,"OA":5,"GR":5,"VE":5,  # South-Southeast
    "OT":0
}

# Categorical encodings applied with Series.map in FeatureBuilder.transform;
# values missing from a map become NaN.
GENDER_MAP = {"female":1, "male":0}
USER_TYPE_MAP = {"HYBRID":0, "DIGITAL":1, "ANALOG":2}
CHANNEL_DETAIL_MAP = {"ORGANIC":0,"COLLABORATOR":1,"POS":2,"SPIN_PREMIA":3,"DIGITAL_ORGANIC":4,"DIGITAL":5}


# Transaction-type groups used to build TX_TYPE_MAP below.
# NOTE(review): despite its name this group also holds cash-out, card purchase and
# ATM withdrawal types; confirm the grouping is intended.
CASH_IN_TX = {
        "CASH_IN_AT_OXXO",
        "CASH_IN_AT_OXXO_QR",
        "CASH_OUT_WITH_CARD_AT_OXXO", 
        "CASH_OUT_AT_OXXO",
        "CASH_OUT_AT_MERCHANT",
        "CARD_PURCHASE",
        "CARD_ATM_WITHDRAWAL"
}
# "P2P_TRANSFER_TARGET" is listed twice; harmless because this is a set.
SPEI_TX = {
        "SPEI_CASH_IN",
        "TRANSFER_TO_CARD",
        "TRANSFER_TO_CLABE",
        "P2P_TRANSFER_TARGET",
        "P2P_TRANSFER_TARGET_CLABE",
        "P2P_TRANSFER_TARGET",
        "P2P_TRANSFER_TARGET_CARD"}

# The three P2P_TRANSFER_TARGET* types also appear in SPEI_TX above.
P2P_TX ={
        "P2P_TRANSFER_TARGET_CLABE",
         "P2P_TRANSFER_TARGET",
         "P2P_TRANSFER_TARGET_CARD",
         "P2P_TRANSFER_SOURCE_CARD",
         "P2P_TRANSFER_SOURCE_CLABE",
         "P2P_TRANSFER_SOURCE",
         "IN_APP_PURCHASE_TAE",
         "IN_APP_PURCHASE_BILLPAYMENT",
         "QR_MERCHANT_PAYMENT",
         "GIFT_CARD_PURCHASE",
         "INTERNATIONAL_REMITTANCE_CASH_IN"}

# Transaction type -> group code. Later unpackings overwrite earlier ones, so the
# types shared by SPEI_TX and P2P_TX end up with code 2.
# NOTE(review): TX_TYPE_MAP is not read by any cell visible in this notebook.
TX_TYPE_MAP = {
    **{k: 0 for k in CASH_IN_TX},  # 0 = Cash_In
    **{k: 1 for k in SPEI_TX},     # 1 = SPEI
    **{k: 2 for k in P2P_TX}       # 2 = P2P
}

In [13]:
class FeatureBuilder:
    """Builds the numeric feature matrix, label vector and metadata for the 5tx model.

    Typical use is ``fb = FeatureBuilder(cfg).fit(train_df)`` followed by
    ``fb.transform(df)``. ``fit`` learns the state one-hot vocabulary. The
    RobustScaler is fitted lazily inside the FIRST ``transform`` call, so that
    first call must receive the data the scaler should be fitted on.

    All column names and the timezone are read from the ``cfg`` passed to the
    constructor, so a pickled builder does not depend on a module-level ``CFG``.
    """

    def __init__(self, cfg: Config):
        self.cfg = cfg
        # State abbreviations that get a one-hot column (learned in fit).
        self.ohe_states_: t.List[str] = []
        # Scaling state (learned during the first transform call).
        self.scaler_: t.Optional[RobustScaler] = None
        self.numeric_cols_: t.List[str] = []
        self._fitted_scaler: bool = False

    @staticmethod
    def _state_to_abbr(s: t.Any) -> str:
        """Map a state name or abbreviation to a REGION_BUCKET key; unknown or missing -> "OT"."""
        if pd.isna(s): return "OT"
        s = str(s)
        # Already an abbreviation we know about.
        if s.upper() in REGION_BUCKET: return s.upper()
        s2 = _norm(s)
        s2 = STATE_SYNONYMS.get(s2, s2)
        return STATE_TO_ABBR.get(s2, "OT")

    @staticmethod
    def _canon_birthstate(s: t.Any) -> str:
        """Map a raw birthState code to its canonical abbreviation; unknown or missing -> "OT"."""
        if pd.isna(s): return "OT"
        s = str(s).upper()
        return CANON.get(s, s if s in REGION_BUCKET else "OT")

    def _mk_time_feats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with calendar features derived from the signup columns.

        Adds signup_dow, signup_week, signup_month, signup_daypart (computed in the
        configured local timezone), is_holiday_mx and the near_payday_* flags.
        """
        ts = pd.to_datetime(df[self.cfg.signup_ts_col], utc=True).dt.tz_convert(ZoneInfo(self.cfg.tz_local))
        df = df.copy()
        df["signup_dow"] = ts.dt.weekday.astype("int16")
        df["signup_week"] = ts.dt.isocalendar().week.astype("int16")
        df["signup_month"] = ts.dt.month.astype("int16")
        hr = ts.dt.hour
        # 0 = 05:00-11:59, 1 = 12:00-17:59, 2 = any other hour.
        df["signup_daypart"] = np.select([(hr>=5)&(hr<=11),(hr>=12)&(hr<=17)],[0,1],default=2).astype("int8")

        # Holiday flag: test each DISTINCT signup date once instead of once per row,
        # and skip missing dates (rows without a signup date get 0).
        signup_days = pd.to_datetime(df[self.cfg.signup_date_col]).dt.date
        distinct_days = [d for d in signup_days.unique() if not pd.isna(d)]
        mx_hol = holidays.MX(years=sorted({d.year for d in distinct_days}))
        holiday_days = {d for d in distinct_days if d in mx_hol}
        df["is_holiday_mx"] = signup_days.isin(holiday_days).astype("int8")

        day = ts.dt.day
        # Day-of-month of the last day of the signup month.
        eom = (ts + pd.offsets.MonthEnd(0)).dt.day
        df["near_payday_any"] = ((np.abs(day-1)<=3)|(np.abs(day-15)<=3)|(np.abs(day-eom)<=3)).astype("int8")
        df["near_payday_1st"] = (np.abs(day-1)<=3).astype("int8")
        df["near_payday_15"]  = (np.abs(day-15)<=3).astype("int8")
        df["near_payday_eom"] = (np.abs(day-eom)<=3).astype("int8")
        return df

    def fit(self, df: pd.DataFrame):
        """Learn the list of state abbreviations to one-hot encode; returns ``self``."""
        st = df["stateName"].map(self._state_to_abbr)
        self.ohe_states_ = sorted(st.dropna().unique().tolist())
        if "OT" not in self.ohe_states_: self.ohe_states_.append("OT")
        return self

    @staticmethod
    def _coerce_numeric(X: pd.DataFrame) -> pd.DataFrame:
        """Keep only numeric columns: ints/floats -> float32, bools -> int8, everything else dropped."""
        # 1) BigQuery extension dtypes (e.g. dbdate) are removed.
        bad_ext = [c for c in X.columns if "db_dtypes" in str(X[c].dtype).lower() or "dbdate" in str(X[c].dtype).lower()]
        X = X.drop(columns=bad_ext, errors="ignore")

        # 2) Safe casts; nullable integers get their missing values filled with 0.
        datetime_cols = []
        for c in list(X.columns):
            col = X[c]
            if pd.api.types.is_integer_dtype(col) or str(col.dtype).startswith("Int"):
                X[c] = pd.to_numeric(col, errors="coerce").fillna(0).astype("float32")
            elif pd.api.types.is_float_dtype(col):
                X[c] = col.astype("float32")
            elif pd.api.types.is_bool_dtype(col):
                X[c] = col.astype("int8")
            elif pd.api.types.is_datetime64_any_dtype(col):
                datetime_cols.append(c)
        if datetime_cols:
            X = X.drop(columns=datetime_cols)

        # 3) Remaining object columns are removed.
        obj_cols = X.select_dtypes(include=["object"]).columns.tolist()
        if obj_cols:
            X = X.drop(columns=obj_cols)
        return X

    def _scale_numeric(self, X: pd.DataFrame) -> pd.DataFrame:
        """Fit the RobustScaler on the first call, then scale the selected numeric columns."""
        if not self._fitted_scaler:
            # First call: pick float32 columns with more than 10 distinct values,
            # excluding the state one-hot columns, and fit the scaler on them.
            numeric_cols = [c for c in X.columns
                            if not c.startswith('state_')
                            and X[c].dtype == 'float32'
                            and X[c].nunique() > 10]
            self.numeric_cols_ = numeric_cols

            if len(numeric_cols) > 0:
                self.scaler_ = RobustScaler()
                self.scaler_.fit(X[numeric_cols])
                print(f'RobustScaler fitted con {len(numeric_cols)} features numéricas')
            else:
                print('No se encontraron features numéricas para normalizar')
            self._fitted_scaler = True

        if self.scaler_ is not None and len(self.numeric_cols_) > 0:
            existing_cols = [c for c in self.numeric_cols_ if c in X.columns]
            if len(existing_cols) > 0:
                X[existing_cols] = self.scaler_.transform(X[existing_cols])
        return X

    def transform(self, df: pd.DataFrame) -> Bunch:
        """Build features for ``df``.

        Returns:
            Bunch with
              * ``X``: numeric feature DataFrame (scaled where applicable),
              * ``y``: int array built from ``cfg.label_col`` (missing -> 0), or
                ``None`` when ``df`` has no label column (e.g. inference data),
              * ``meta``: user_id, signup_date, gender, channelDetail, state_abbr.

        Raises:
            AssertionError: if a feature column matches a LEAK_PATTERNS regex.
        """
        cfg = self.cfg
        df = df.copy()
        s_ts = pd.to_datetime(df[cfg.signup_ts_col], utc=True)

        # Core categoricals -> numeric codes
        df["gender_bin"] = df["gender"].map(GENDER_MAP).astype("float32")
        df["user_type_tri"] = df["user_type"].map(USER_TYPE_MAP).astype("float32")
        df["channel_detail_code"] = df["channelDetail"].map(CHANNEL_DETAIL_MAP).astype("float32")

        # Birth-region bucket and age at signup
        bcanon = df["birthState"].map(self._canon_birthstate)
        df["birth_bucket"] = bcanon.map(REGION_BUCKET).astype("float32")
        bdate = pd.to_datetime(df["birth_date"], errors="coerce", utc=True)
        df["age_years"] = ((s_ts - bdate).dt.days/365.25).astype("float32")

        # Calendar features
        df = self._mk_time_feats(df)

        # State one-hot columns, using the vocabulary learned in fit
        st = df["stateName"].map(self._state_to_abbr)
        for ab in self.ohe_states_:
            df[f"state_{ab}"] = (st==ab).astype("int8")

        # Confirmation flags are recomputed as "confirmed strictly before signup";
        # missing timestamps count as not confirmed.
        # NOTE(review): in the sample rows shown in this notebook the confirmation
        # timestamps are later than signup_ts, so these flags may be 0 for most rows.
        if "phone_conf_ts" in df.columns:
            phn_ts = pd.to_datetime(df["phone_conf_ts"], errors="coerce", utc=True)
            df["phn_confir"] = (phn_ts < s_ts).fillna(False).astype("int8")
        else:
            df["phn_confir"] = 0

        if "email_conf_ts" in df.columns:
            email_ts = pd.to_datetime(df["email_conf_ts"], errors="coerce", utc=True)
            df["email_confir"] = (email_ts < s_ts).fillna(False).astype("int8")
        else:
            df["email_confir"] = 0

        # Card_linked_date -> "linked before signup" flag and lag in days.
        # NOTE(review): Card_linked_date appears to be a date without time; after
        # to_datetime it compares as midnight UTC, so a card linked on the signup
        # day counts as "before" even if it happened later that day. Confirm.
        if "Card_linked_date" in df.columns:
            card_dt = pd.to_datetime(df["Card_linked_date"], errors="coerce", utc=True)
            before = card_dt < s_ts
            lag_days = (s_ts - card_dt).dt.days.astype("float32")
            df["card_linked_before_signup"] = before.fillna(False).astype("int8")
            # The lag is only kept where the link happened before signup.
            df["card_linked_lag_days"] = np.where(before, lag_days, np.nan).astype("float32")
            df = df.drop(columns=["Card_linked_date"])

        # NOTE(review): lifespan_days and days_since_last are deliberately NOT dropped
        # (commented out below) and tx_30d_amount is not listed at all; their names
        # suggest they are computed from transactions after signup. Confirm they are
        # available at scoring time before trusting the validation metrics.
        LEAKY_FEATURES = {
            #'lifespan_days',
            #'days_since_last',
            'tx_30d_count',
            'tx_30d_from_activation',
            #'p_activation_30d',
            'p_tx_0',
            'p_tx_1',
            'p_tx_2'
        }

        # Assemble X: drop leak-prone columns, identifiers and the raw columns that
        # were already encoded above (exact-name match only).
        drop_cols = set(LEAKY_ALWAYS) | {
            "stateName","gender","user_type","channelDetail","birthState","birth_date",
            cfg.signup_date_col, cfg.signup_ts_col
        } | set(EXTRA_DROP_TS) | LEAKY_FEATURES

        drop_cols = [c for c in drop_cols if c in df.columns]
        X = df.drop(columns=drop_cols, errors="ignore")

        X = self._coerce_numeric(X)
        X = self._scale_numeric(X)

        # Binary target; absent on unlabeled (inference) data.
        if cfg.label_col in df.columns:
            y = pd.to_numeric(df[cfg.label_col], errors="coerce").fillna(0).astype(int).values
        else:
            y = None

        # Final leakage check.
        assert_no_regex_leak(X)
        # assert_no_labelish_cols(X) stays disabled, as before.
        # NOTE(review): LEAK_BAN lists label_activated_30d, which is not dropped
        # above, so enabling the check would presumably fail; decide which list is right.

        meta = pd.DataFrame({
            "user_id": df.get("user_id", pd.Series(index=df.index, dtype="object")),
            "signup_date": pd.to_datetime(df[cfg.signup_date_col], errors="coerce"),
            "gender": df["gender"].astype(str),
            "channelDetail": df["channelDetail"].astype(str),
            "state_abbr": st.astype(str)
        })
        return Bunch(X=X, y=y, meta=meta)

In [14]:
# Keep only users whose 30-day activation label equals the configured filter value,
# so the printed "threshold" always matches the filter that was actually applied.
df_5tx = df[df["label_activated_30d"] == CFG.activation_filter].reset_index(drop=True)

print(f"Threshold de Activación aplicado: {CFG.activation_filter}")
print("Shape df_5tx (activados y high-score):", df_5tx.shape)
print("Rate 5tx en esta población:", df_5tx[CFG.label_col].mean())

# Build features with FeatureBuilder.
# NOTE(review): the scaler is fitted inside this first transform call, i.e. on the
# whole population including the later holdout window.
fb = FeatureBuilder(CFG).fit(df_5tx)
bunch = fb.transform(df_5tx)

X_all = bunch.X.values.astype("float32")
y_all = df_5tx[CFG.label_col].astype(int).values

signup_dates = pd.to_datetime(df_5tx[CFG.signup_date_col], errors="coerce")

print("X_all shape:", X_all.shape)
print("y_all rate (5tx):", y_all.mean())

Threshold de Activación aplicado: 1
Shape df_5tx (activados y high-score): (1470563, 37)
Rate 5tx en esta población: 0.6919791943629753
RobustScaler fitted con 8 features numéricas
X_all shape: (1470563, 64)
y_all rate (5tx): 0.6919791943629753


In [15]:
# Pearson and Spearman correlation of every feature against the target

# Turn X into a DataFrame with feature names
if hasattr(bunch.X, "columns"):
    X_df = bunch.X.copy()
    cols = list(X_df.columns)
else:
    cols = getattr(bunch, "feature_names", [f"f_{i}" for i in range(bunch.X.shape[1])])
    X_df = pd.DataFrame(bunch.X, columns=cols)

# Target
y = df_5tx["label_5tx_30d"].astype(int)

# Pearson (linear) correlation, one value per feature column
pearson_corr = X_df.apply(lambda col: col.corr(y, method="pearson"))

# Spearman (rank) correlation, one value per feature column
spearman_corr = X_df.apply(lambda col: col.corr(y, method="spearman"))

# Collect both into a single DataFrame
corr_df = pd.DataFrame({
    "feature": cols,
    "pearson": pearson_corr.values,
    "spearman": spearman_corr.values
})

# Sort by highest absolute correlation (Pearson first, Spearman as tie-breaker)
corr_df["abs_pearson"] = corr_df["pearson"].abs()
corr_df["abs_spearman"] = corr_df["spearman"].abs()

corr_df_sorted = corr_df.sort_values(by=["abs_pearson", "abs_spearman"], ascending=False)

print("Top 20 features más correlacionadas con label_5tx_30d:")
display(corr_df_sorted.head(20))

# Alert flags
# NOTE(review): a single-feature correlation below this threshold does not rule out
# leakage; in the output below tx_30d_amount reaches a Spearman of 0.55.
THRESHOLD_ALERT = 0.60  # adjust to how strict the check should be

suspicious = corr_df_sorted[
    (corr_df_sorted["abs_pearson"] > THRESHOLD_ALERT) |
    (corr_df_sorted["abs_spearman"] > THRESHOLD_ALERT)
]

if suspicious.shape[0] > 0:
    print("\n⚠️ ALERTA: Features con correlación ALTA con el target")
    display(suspicious)
else:
    print("\n✔️ No se detectaron features con correlación fuerte (Pearson o Spearman) contra el target.")


Top 20 features más correlacionadas con label_5tx_30d:


Unnamed: 0,feature,pearson,spearman,abs_pearson,abs_spearman
13,days_since_last,-0.299,-0.3378,0.299,0.3378
10,tx_30d_amount,0.2454,0.5535,0.2454,0.5535
12,lifespan_days,0.2388,0.2824,0.2388,0.2824
14,days_to_first_activation,-0.2371,-0.133,0.2371,0.133
0,userTypeIdentifier,0.2284,0.2206,0.2284,0.2206
16,user_type_tri,-0.2284,-0.2206,0.2284,0.2206
5,phn_confir_d7,0.1806,0.1806,0.1806,0.1806
19,age_years,-0.149,-0.1385,0.149,0.1385
1,accountLevel,0.1274,0.1288,0.1274,0.1288
7,both_confir_d7,0.1174,0.1174,0.1174,0.1174



✔️ No se detectaron features con correlación fuerte (Pearson o Spearman) contra el target.


# Models

In [16]:
# Chronological ordering and definition of the train / embargo / holdout windows.
# NOTE(review): this cell overwrites signup_dates, X_all and y_all in place, and the
# scoring cell later reuses `order` to re-align df_5tx. Re-running this cell on its
# own recomputes `order` from the already-sorted dates, which breaks that alignment;
# run it exactly once after the feature-building cell.
order = np.argsort(signup_dates.values)
signup_dates = signup_dates.iloc[order].reset_index(drop=True)
X_all = X_all[order]
y_all = y_all[order]

# Holdout = last CFG.holdout_days days; training ends CFG.embargo_days before it.
max_date = signup_dates.max()
holdout_start = max_date - pd.Timedelta(days=CFG.holdout_days)
# embargo_end is not read again in the cells visible in this notebook.
embargo_end = holdout_start
train_end = holdout_start - pd.Timedelta(days=CFG.embargo_days)

# Rows in neither window (the embargo gap, plus any rows with a missing date) are excluded from both.
train_mask = signup_dates <= train_end
holdout_mask = signup_dates > holdout_start
embargo_mask = (~train_mask) & (~holdout_mask)

print("Última fecha:", max_date.date())
print("Train hasta:", train_end.date())
print("Embargo entre:", train_end.date(), "y", holdout_start.date())
print("Holdout desde:", holdout_start.date())
print("N train   :", train_mask.sum())
print("N embargo :", embargo_mask.sum())
print("N holdout :", holdout_mask.sum())


Última fecha: 2025-10-21
Train hasta: 2025-10-04
Embargo entre: 2025-10-04 y 2025-10-07
Holdout desde: 2025-10-07
N train   : 1378919
N embargo : 16420
N holdout : 75224


In [17]:
# Training window only (rows are already in chronological order).
X_train_full = X_all[train_mask.values]
y_train_full = y_all[train_mask.values]

# Internal split by position: first 80% of rows for fitting, last 20% for validation.
# NOTE(review): the cut is by row position, so rows sharing a signup date can fall on
# both sides and there is no embargo between the two parts.
n_train = X_train_full.shape[0]
split_idx = int(n_train * 0.8)

X_tr, X_val = X_train_full[:split_idx], X_train_full[split_idx:]
y_tr, y_val = y_train_full[:split_idx], y_train_full[split_idx:]

print("Train interno:", X_tr.shape[0], "obs")
print("Valid interno:", X_val.shape[0], "obs")


Train interno: 1103135 obs
Valid interno: 275784 obs


In [18]:
# Model: LightGBM classifier with class_weight="balanced"
model_5tx =  LGBMClassifier(
    n_estimators=400,
    max_depth=8,
    learning_rate=0.05,
    class_weight="balanced",
    n_jobs=-1,
    random_state=CFG.random_state,
    verbose=-1
)

model_5tx.fit(X_tr, y_tr)

# Predicted probability of the positive class on the internal validation part.
p_val = model_5tx.predict_proba(X_val)[:, 1]

# NOTE(review): the model is trained with re-weighted classes and is not calibrated
# afterwards, so the Brier score presumably reflects shifted probabilities; confirm
# before comparing it across models.
print("AP (valid):", average_precision_score(y_val, p_val))
print("AUC (valid):", roc_auc_score(y_val, p_val))
print("Brier (valid):", brier_score_loss(y_val, p_val))

AP (valid): 0.9831147859105813
AUC (valid): 0.961023505909501
Brier (valid): 0.0773091553929997


In [20]:
def objective_temporal_single(trial, model_name, X_tr, y_tr, X_val, y_val):
    """Objective for a single temporal train/validation split.

    Builds the model for ``model_name`` from ``trial`` via ``build_model``, fits it
    on the training part and returns the average precision of its positive-class
    probabilities on the validation part.
    """
    candidate = build_model(model_name, trial)
    candidate.fit(X_tr, y_tr)

    val_scores = candidate.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, val_scores)

In [21]:
def lift_at_fracs(y_true: np.ndarray, y_score: np.ndarray, fracs) -> dict:
    """Capture rate and lift among the top-scored fraction of observations.

    Args:
        y_true: binary labels (1 = positive). Any array-like is accepted,
            including lists and pandas Series regardless of their index.
        y_score: scores, higher = more likely positive; same length as ``y_true``.
        fracs: iterable of fractions in (0, 1], e.g. ``(0.01, 0.05)``.

    Returns:
        ``{frac: {"capture_rate", "lift", "n_users"}}`` where ``n_users`` is
        ``max(1, int(n * frac))``, ``capture_rate`` is the share of all positives
        found in those top rows and ``lift`` is ``capture_rate / frac``. Both are
        NaN when ``y_true`` contains no positives.

    Raises:
        ValueError: if ``y_true`` and ``y_score`` differ in length.
    """
    # Work on plain positional arrays so list inputs and label-indexed Series behave the same.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_score, dtype=float).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y_true and y_score must have the same length, got {labels.shape[0]} and {scores.shape[0]}"
        )

    # Stable sort keeps tied scores in input order, making the result reproducible.
    ranking = np.argsort(-scores, kind="stable")
    ranked_labels = labels[ranking]
    total_pos = labels.sum()
    n_obs = labels.shape[0]

    res = {}
    for f in fracs:
        k = max(1, int(n_obs * f))
        captured = ranked_labels[:k].sum()
        if total_pos > 0:
            capture_rate = captured / total_pos
            lift = capture_rate / f
        else:
            capture_rate, lift = np.nan, np.nan
        res[f] = {
            "capture_rate": float(capture_rate),
            "lift": float(lift),
            "n_users": int(k),
        }
    return res

In [22]:
# Evaluation on the holdout window (rows after holdout_start, never used for fitting)
X_holdout = X_all[holdout_mask.values]
y_holdout = y_all[holdout_mask.values]

p_holdout = model_5tx.predict_proba(X_holdout)[:, 1]

# Same metric set for the internal validation part and for the holdout.
metrics_val = {
    "AP": float(average_precision_score(y_val, p_val)),
    "AUC": float(roc_auc_score(y_val, p_val)),
    "Brier": float(brier_score_loss(y_val, p_val)),
    "Lift": lift_at_fracs(y_val, p_val, CFG.lift_fracs),
}

metrics_holdout = {
    "AP": float(average_precision_score(y_holdout, p_holdout)),
    "AUC": float(roc_auc_score(y_holdout, p_holdout)),
    "Brier": float(brier_score_loss(y_holdout, p_holdout)),
    "Lift": lift_at_fracs(y_holdout, p_holdout, CFG.lift_fracs),
}

# json.dumps turns the float fraction keys of the Lift dict into strings.
print("\n MÉTRICAS 5TX (VALID) ")
print(json.dumps(metrics_val, indent=2))
print("\n MÉTRICAS 5TX (HOLDOUT) ")
print(json.dumps(metrics_holdout, indent=2))


 MÉTRICAS 5TX (VALID) 
{
  "AP": 0.9831147859105813,
  "AUC": 0.961023505909501,
  "Brier": 0.0773091553929997,
  "Lift": {
    "0.01": {
      "capture_rate": 0.01409956121060868,
      "lift": 1.409956121060868,
      "n_users": 2757
    },
    "0.02": {
      "capture_rate": 0.028204236516687294,
      "lift": 1.4102118258343648,
      "n_users": 5515
    },
    "0.05": {
      "capture_rate": 0.0705131483394532,
      "lift": 1.4102629667890638,
      "n_users": 13789
    },
    "0.1": {
      "capture_rate": 0.1409956121060868,
      "lift": 1.409956121060868,
      "n_users": 27578
    }
  }
}

 MÉTRICAS 5TX (HOLDOUT) 
{
  "AP": 0.984328447199917,
  "AUC": 0.9630820049188185,
  "Brier": 0.0765567789159378,
  "Lift": {
    "0.01": {
      "capture_rate": 0.014037969721294032,
      "lift": 1.4037969721294032,
      "n_users": 752
    },
    "0.02": {
      "capture_rate": 0.028075939442588065,
      "lift": 1.4037969721294032,
      "n_users": 1504
    },
    "0.05": {
      "cap

In [28]:
# Scores every row of X_all, which is already in the chronological order set by the split cell.

# 1. Predict the conditional probability for ALL activated users (X_all).
# NOTE(review): X_all includes the rows the model was trained on, so most of these
# scores are in-sample.

p_cond_all = model_5tx.predict_proba(X_all)[:, 1]

# 2. Build the score frame from the original rows, re-ordered with the same `order`
#    used to sort X_all. This depends on `order` still being the permutation computed
#    by the split cell.
df_5tx_scores = df_5tx.iloc[order].copy()

# 3. Attach the conditional probability (model 2)
df_5tx_scores["p_5tx_30d"] = p_cond_all


# Select only ids and scores for export so the result stays small.
# NOTE(review): nothing is written to disk in this cell.
cols_export = ["user_id", "signup_date", "p_5tx_30d"]
df_export = df_5tx_scores[cols_export]


In [None]:
# Display the export frame (ids, signup date and score).
df_export

# MLOps example

In [None]:
def infer_minimal(data_path: str, artifacts_dir: str = "artifacts/") -> pd.DataFrame:
    """Score a parquet file with the persisted feature builder and model.

    Args:
        data_path: path of the parquet file to score; it must contain ``user_id``
            plus the raw columns the feature builder expects.
        artifacts_dir: directory holding ``feature_builder.joblib`` and
            ``model_5tx.joblib`` (the same file names Model5txScorer.load uses).

    Returns:
        DataFrame with columns ``user_id`` and ``score`` (positive-class probability).
    """
    data = pd.read_parquet(data_path)

    # Feature transform.
    # NOTE: joblib.load unpickles arbitrary objects; only load trusted artifacts.
    feature_builder = joblib.load(os.path.join(artifacts_dir, "feature_builder.joblib"))
    X = feature_builder.transform(data).X

    # Model inference.
    model = joblib.load(os.path.join(artifacts_dir, "model_5tx.joblib"))
    preds = model.predict_proba(X)[:, 1]
    return pd.DataFrame({"user_id": data["user_id"], "score": preds})

In [None]:
class Model5txScorer:
    """
    Preloaded scorer for production inference.

    Usage:
        scorer = Model5txScorer.load("artifacts/")
        predictions = scorer.predict(new_data)
    """

    def __init__(self, model, feature_builder):
        # model: object exposing predict_proba(X)
        # feature_builder: object whose transform(data) result exposes .X
        self.model = model
        self.feature_builder = feature_builder

    @classmethod
    def load(cls, artifacts_dir: str = "artifacts/") -> "Model5txScorer":
        """Load ``model_5tx.joblib`` and ``feature_builder.joblib`` from ``artifacts_dir``.

        NOTE: joblib.load unpickles arbitrary objects; only load trusted artifacts.
        """
        model = joblib.load(f"{artifacts_dir}/model_5tx.joblib")
        fb = joblib.load(f"{artifacts_dir}/feature_builder.joblib")
        return cls(model, fb)

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        data -> transform -> predict -> scores

        Returns a DataFrame with ``user_id`` and ``p_5tx_30d`` (positive-class
        probability), one row per input row and in the same order.
        """
        X = self.feature_builder.transform(data).X
        scores = self.model.predict_proba(X)[:, 1]
        return pd.DataFrame({
            "user_id": data["user_id"].values,
            "p_5tx_30d": scores
        })

In [None]:
# Example of a daily batch-scoring job: read yesterday's signups, build features,
# predict and append the scores to a BigQuery table.
# NOTE(review): this is a template, not a runnable job. The table name in the query
# is empty, the artifact locations are placeholders, and nothing in this notebook
# writes the .joblib artifacts that are loaded here.
if __name__ == "__main__":
    from google.cloud import bigquery
    import joblib
    
    # 1. READ DATA
    client = bigquery.Client()
    # The FROM clause below still needs the fully-qualified source table.
    query = '''
        SELECT * FROM ``
        WHERE signup_date = CURRENT_DATE() - 1
    '''
    data = client.query(query).to_dataframe()
    
    # 2. TRANSFORM
    # NOTE(review): joblib.load expects a local path or an open file object; a gs://
    # URI presumably has to be downloaded or opened through a GCS client first. Confirm.
    feature_builder = joblib.load("gs://bucket/artifacts/feature_builder.joblib")
    X = feature_builder.transform(data).X
    
    # 3. PREDICT
    model = joblib.load("gs://bucket/artifacts/model_5tx.joblib")
    preds = model.predict_proba(X)[:, 1]
    
    # 4. WRITE SCORES
    # score_date is the local wall-clock time of the run (no timezone attached).
    scores_df = pd.DataFrame({
        "user_id": data["user_id"],
        "p_5tx_30d": preds,
        "score_date": pd.Timestamp.now()
    })
    # if_exists="append" adds rows to the destination table on every run.
    scores_df.to_gbq("project.dataset.scores_5tx", if_exists="append")