# **XGBoost**



In [1]:
# Imports 
import os, json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier
import xgboost as xgb
pd.set_option("display.max_columns", None)


### **Carga del *data frame***

In [None]:
import csv

def load_table(path):
    # intento estándar (TSV con engine C)
    try:
        return pd.read_csv(path, sep='\t', engine='c', low_memory=False)
    except Exception as e1:
        print(f"[1] Falló TSV con engine='c': {e1}")
    # fallback: TSV con engine python (sin low_memory!)
    try:
        return pd.read_csv(path, sep='\t', engine='python',
                           quoting=csv.QUOTE_NONE,
                           on_bad_lines='warn',
                           encoding='utf-8')
    except Exception as e2:
        print(f"[2] Falló TSV con engine='python': {e2}")
    # último recurso: autodetectar delimitador
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        head = ''.join([next(f) for _ in range(10)])
    sniff = csv.Sniffer().sniff(head, delimiters=[',','\t',';'])
    delim = sniff.delimiter
    print(f"[3] Sniffer detectó delimitador: {repr(delim)}")
    return pd.read_csv(path, sep=delim, engine='python',
                       quoting=csv.QUOTE_NONE,
                       on_bad_lines='warn',
                       encoding='utf-8')

# Parámetros
DATA_DIR = '.'
SPOTIFY_JSON_DIR = './spotify_api_data'
OUT_CSV = 'submission_xgb_formato_clase.csv'
SEED = 123

train_data = os.path.join(DATA_DIR, 'data', 'train_data.txt')
test_data  = os.path.join(DATA_DIR, 'data', 'test_data.txt')

train_df = load_table(train_data)
test_df  = load_table(test_data)

print(train_df.shape, test_df.shape)
train_df.head(3)


[1] Falló TSV con engine='c': Error tokenizing data. C error: Expected 21 fields in line 15, saw 42



### **Valores faltantes**

In [None]:
# Quick NA check
na_train = train_df.isna().mean().sort_values(ascending=False).head(20)
na_test  = test_df.isna().mean().sort_values(ascending=False).head(20)
display(pd.DataFrame({'train_na%': na_train, 'test_na%': na_test}).fillna(0).round(3))


Unnamed: 0,train_na%,test_na%
audiobook_chapter_title,1.0,1.0
audiobook_chapter_uri,1.0,1.0
audiobook_title,1.0,1.0
audiobook_uri,1.0,1.0
conn_country,0.0,0.0
episode_name,1.0,0.997
episode_show_name,1.0,0.997
incognito_mode,0.0,0.0
ip_addr,0.0,0.0
master_metadata_album_album_name,0.0,0.003


### **Preparar conjuntos de entrenamiento, validación (*hold-out*) y evaluación**

In [None]:
import os
import pandas as pd

CANON = [
    "ts","platform","conn_country","ip_addr",
    "master_metadata_track_name","master_metadata_album_artist_name","master_metadata_album_album_name",
    "spotify_track_uri","episode_name","episode_show_name","spotify_episode_uri",
    "audiobook_title","audiobook_uri","audiobook_chapter_uri","audiobook_chapter_title",
    "reason_end","shuffle","offline","offline_timestamp","incognito_mode","username","obs_id"
]
CANON_TEST = [c for c in CANON if c != "reason_end"]

def _clean_to_ncols(in_path, out_path, ncols, header_names):
    """Deja solo filas con exactamente ncols columnas (separadas por TAB)
       y escribe header_names como encabezado."""
    kept = 0; skipped = 0
    with open(in_path, 'r', encoding='utf-8', errors='replace') as fin, \
         open(out_path, 'w', encoding='utf-8', newline='') as fout:
        fout.write('\t'.join(header_names) + '\n')
        # saltamos la primera línea original (sea header o no)
        first = fin.readline()
        for line in fin:
            row = line.rstrip('\n').split('\t')
            if len(row) == ncols:
                fout.write('\t'.join(row) + '\n'); kept += 1
            else:
                skipped += 1
    return kept, skipped

def _load_tsv_fixed(path_in, expect_labels=True):
    ncols = 22 if expect_labels else 21
    names = CANON if expect_labels else CANON_TEST
    path_out = path_in.replace('.txt', f'_clean_{ncols}.tsv')
    kept, skipped = _clean_to_ncols(path_in, path_out, ncols, names)
    print(f"[{os.path.basename(path_in)}] OK: {kept} | saltadas: {skipped} | cols: {ncols}")
    df = pd.read_csv(path_out, sep='\t', engine='c')
    # sanity final
    assert list(df.columns) == names, "Columnas inesperadas tras limpieza."
    return df

# rutas base
train_path = os.path.join(DATA_DIR, 'data', 'train_data.txt')
test_path  = os.path.join(DATA_DIR, 'data', 'test_data.txt')

# 1) intentamos cargar con el formato esperado (train=22, test=21)
train_df = _load_tsv_fixed(train_path, expect_labels=True)
test_df  = _load_tsv_fixed(test_path,  expect_labels=False)

# 2) si por algún motivo train NO tiene reason_end, probamos swap automático
if 'reason_end' not in train_df.columns and 'reason_end' in test_df.columns:
    print(" Detectado swap train/test. Intercambiando…")
    # recargar al revés
    train_df = _load_tsv_fixed(test_path,  expect_labels=True)
    test_df  = _load_tsv_fixed(train_path, expect_labels=False)

# 3) ahora sí, target
if 'reason_end' not in train_df.columns:
    raise ValueError("No encuentro 'reason_end' en el train tras la limpieza. Verificá que el train sea el de entrenamiento.")
train_df['target'] = (train_df['reason_end'].astype(str).str.strip().str.lower() == 'fwdbtn').astype('int8')

print("Shapes:", train_df.shape, test_df.shape)
display(train_df.head(3))


[train_data.txt] OK: 911329 | saltadas: 14 | cols: 22


  df = pd.read_csv(path_out, sep='\t', engine='c')


[test_data.txt] OK: 51570 | saltadas: 0 | cols: 21
Shapes: (911329, 23) (51570, 21)


Unnamed: 0,ts,platform,conn_country,ip_addr,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,episode_name,episode_show_name,spotify_episode_uri,audiobook_title,audiobook_uri,audiobook_chapter_uri,audiobook_chapter_title,reason_end,shuffle,offline,offline_timestamp,incognito_mode,username,obs_id,target
0,2013-10-30 19:20:00+00:00,Windows 7 (6.1.7601; x86; SP1; S),AR,6472d74d7192fecaa2744625ea9e29285bde602e641a03...,The Eater Of Dreams,Nine Inch Nails,Hesitation Marks,spotify:track:1IPdwxRUbuNZiRpFN49RQC,,,,,,,,fwdbtn,False,False,,False,4324517c6925bba98b4e3a6896d1398fae8f777969e7bc...,16,1
1,2013-10-30 19:20:00+00:00,Windows 7 (6.1.7601; x86; SP1; S),AR,6472d74d7192fecaa2744625ea9e29285bde602e641a03...,Copy Of A,Nine Inch Nails,Hesitation Marks,spotify:track:4BFKCEp4gwG3QHNlYodLMy,,,,,,,,trackdone,False,False,,False,4324517c6925bba98b4e3a6896d1398fae8f777969e7bc...,17,0
2,2013-10-30 19:40:00+00:00,Windows 7 (6.1.7601; x86; SP1; S),AR,6472d74d7192fecaa2744625ea9e29285bde602e641a03...,All Time Low,Nine Inch Nails,Hesitation Marks,spotify:track:7r8raRQN9BxUIs5op8GG8j,,,,,,,,trackdone,False,False,,False,4324517c6925bba98b4e3a6896d1398fae8f777969e7bc...,18,0


### **XGBoost**

In [None]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import pandas as pd

# columnas categóricas y numéricas
cat_cols = ['platform','conn_country','ip_addr',
            'master_metadata_track_name','master_metadata_album_artist_name',
            'master_metadata_album_album_name','username']
cat_cols = [c for c in cat_cols if c in train_df.columns]

num_cols = ['hour','dayofweek','month','shuffle','offline','incognito_mode',
            'has_track','has_episode','has_audiobook',
            'epi_duration_ms','epi_explicit','epi_langs_n','epi_release_year']
num_cols = [c for c in num_cols if c in train_df.columns]

# rellenar NaN en categóricas con un token fijo
for c in cat_cols:
    train_df[c] = train_df[c].fillna("<NA>")
    test_df[c]  = test_df[c].fillna("<NA>")

# encoder ordinal
enc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    dtype=np.int32
)
enc.fit(pd.concat([train_df[cat_cols], test_df[cat_cols]], axis=0) if cat_cols else pd.DataFrame())

def build_matrix(df: pd.DataFrame):
    Xc = enc.transform(df[cat_cols]).astype(np.float32) if cat_cols else np.empty((len(df),0), dtype=np.float32)
    Xn = df[num_cols].to_numpy(dtype=np.float32) if num_cols else np.empty((len(df),0), dtype=np.float32)
    return np.hstack([Xc, Xn]).astype(np.float32)

# features y target
X = build_matrix(train_df)
Xt = build_matrix(test_df)
y = train_df['target'].to_numpy(dtype=np.int32)

# grupos (por usuario)
groups = train_df['username'].fillna('NA').astype(str).to_numpy()

print("Shapes:", X.shape, Xt.shape, y.shape, "| cat:", len(cat_cols), "num:", len(num_cols))


Shapes: (911329, 10) (51570, 10) (911329,) | cat: 7 num: 3


### **Búsqueda de hiperparámetros**

In [None]:
DO_SEARCH = False

if DO_SEARCH:
    param_dist = {
        'n_estimators': [600, 900, 1200, 1500],
        'learning_rate': [0.03, 0.05, 0.07, 0.1],
        'max_depth': [4, 6, 8],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'reg_lambda': [0.5, 1.0, 2.0],
    }
    base = XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        tree_method='hist',
        random_state=SEED,
        n_jobs=-1,
    )
    search = RandomizedSearchCV(
        base, param_distributions=param_dist,
        n_iter=12, scoring='roc_auc', cv=3,
        random_state=SEED, verbose=1, n_jobs=-1
    )
    search.fit(X, y)
    print("Best params:", search.best_params_)
    print("Best CV AUC:", search.best_score_)
    xgb = search.best_estimator_


### **Conjunto de test**

In [None]:
# 1) split por usuario (hold-out 20%)
gss = GroupShuffleSplit(n_splits=1, train_size=0.80, random_state=SEED)
tr_idx, va_idx = next(gss.split(X, y, groups=groups))
X_tr, y_tr = X[tr_idx], y[tr_idx]
X_va, y_va = X[va_idx], y[va_idx]

# 2) pesos por desbalance (por si fwdbtn es minoría)
pos_ratio = float(y.mean())
scale_pos = (1.0 - pos_ratio) / max(pos_ratio, 1e-6)

# 3) params XGBoost (baseline sólido)
params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": "hist",
    "eta": 0.06,
    "max_depth": 6,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "lambda": 1.0,
    "scale_pos_weight": scale_pos,
    "seed": SEED,
}

# 4) DMatrix + train con early stopping
dtr = xgb.DMatrix(X_tr, label=y_tr)
dva = xgb.DMatrix(X_va, label=y_va)
watchlist = [(dtr, "train"), (dva, "valid")]

bst = xgb.train(
    params=params,
    dtrain=dtr,
    num_boost_round=1200,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=False,
)

# 5) AUC validación usando la mejor iteración, compatible 1.x/2.x
def _predict_best(bst, dmat):
    # xgboost >=2.0: iteration_range
    if hasattr(bst, "best_iteration") and bst.best_iteration is not None:
        return bst.predict(dmat, iteration_range=(0, bst.best_iteration + 1))
    # xgboost 1.x: best_ntree_limit
    if hasattr(bst, "best_ntree_limit") and bst.best_ntree_limit:
        return bst.predict(dmat, ntree_limit=bst.best_ntree_limit)
    return bst.predict(dmat)

val_pred = _predict_best(bst, dva)
val_auc = roc_auc_score(y_va, val_pred)
best_round = (bst.best_iteration + 1) if hasattr(bst, "best_iteration") and bst.best_iteration is not None else getattr(bst, "best_ntree_limit", bst.best_score if hasattr(bst, "best_score") else 1200)
print(f"AUC validación: {val_auc:.5f} | best_round={best_round}")

# 6) Reentrenar con la mejor cantidad de árboles
dfull = xgb.DMatrix(X, label=y)
bst_full = xgb.train(
    params=params,
    dtrain=dfull,
    num_boost_round=int(best_round) if isinstance(best_round, (int, np.integer)) else 1200,
    verbose_eval=False,
)

# 7) Predicción a test + submit
dtest = xgb.DMatrix(Xt)
test_proba = _predict_best(bst_full, dtest)
sub = pd.DataFrame({"obs_id": test_df["obs_id"].astype("int64"), "pred_proba": test_proba}).sort_values("obs_id")
sub.to_csv(OUT_CSV, index=False)
print(f"Submit guardado en: {OUT_CSV} | filas: {len(sub)}")
sub.head()


AUC validación: 0.57519 | best_round=90
✅ Submit guardado en: submission_xgb_formato_clase.csv | filas: 51570


Unnamed: 0,obs_id,pred_proba
0,911345,0.730681
1,911346,0.338616
2,911347,0.089688
3,911348,0.089068
4,911349,0.089125
