In [19]:
from pathlib import Path
import sys
PROJECT_ROOT = Path(r"C:\Users\luigu\OneDrive\Escritorio\ProyectoML_YouTube")
sys.path.append(str(PROJECT_ROOT))

DATA_PATH = PROJECT_ROOT / "data" / "youtube_data.csv"

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

from src.preprocessing import build_model_frame, make_preprocessor_prepub, make_target_hit_er
from src.models import make_classification_pipeline
from src.evaluation import evaluate_cv_prob, bin_metrics, precision_at_k

In [20]:
df_raw = pd.read_csv(DATA_PATH, low_memory=False)
df = build_model_frame(df_raw)

# Objetivo binario: top 10% ER por categoría × bucket de duración
y = make_target_hit_er(df, p=0.90)
y.value_counts(normalize=True)

er
0    0.898459
1    0.101541
Name: proportion, dtype: float64

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, stratify=y, random_state=42
)

pre_sup, _ = make_preprocessor_prepub(use_hashtags=False)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [22]:
models = {
    "logreg": make_classification_pipeline(pre_sup, model="logreg"),
    "rf":     make_classification_pipeline(pre_sup, model="rf"),
    "xgb":    make_classification_pipeline(pre_sup, model="xgb"),
}

rows = []
for name, clf in models.items():
    m = evaluate_cv_prob(clf, X_train, y_train, cv=cv)
    m["model"] = name
    rows.append(m)

baseline_df = pd.DataFrame(rows).set_index("model").sort_values("auc_pr", ascending=False)
baseline_df

Unnamed: 0_level_0,auc_roc,auc_pr,p_at_10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
logreg,0.567238,0.137923,0.173419
xgb,0.565373,0.136853,0.165601
rf,0.561627,0.12348,0.139303


In [23]:
# LOGREG — GridSearchCV 
logreg = make_classification_pipeline(pre_sup, model="logreg")
param_grid_logreg = {
    "model__C": np.logspace(-2, 2, 7)  # 0.01..100
}
gs_logreg = GridSearchCV(
    estimator=logreg, param_grid=param_grid_logreg,
    scoring="average_precision", cv=cv, n_jobs=-1, refit=True
)

# RANDOM FOREST — RandomizedSearchCV 
rf = make_classification_pipeline(pre_sup, model="rf")
param_dist_rf = {
    "model__n_estimators": [200, 400, 600, 800],
    "model__max_depth": [None, 8, 12, 16, 24],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", None],
}
rs_rf = RandomizedSearchCV(
    estimator=rf, param_distributions=param_dist_rf,
    n_iter=25, scoring="average_precision", cv=cv, n_jobs=-1,
    random_state=42, refit=True
)

# XGBOOST — RandomizedSearchCV 
xgb = make_classification_pipeline(pre_sup, model="xgb")
param_dist_xgb = {
    "model__n_estimators": [300, 400, 600, 800],
    "model__max_depth": [4, 6, 8],
    "model__learning_rate": [0.05, 0.1, 0.2],
    "model__subsample": [0.7, 0.8, 0.9, 1.0],
    "model__colsample_bytree": [0.6, 0.8, 1.0],
    "model__reg_lambda": [0.0, 1.0, 5.0, 10.0],
}
rs_xgb = RandomizedSearchCV(
    estimator=xgb, param_distributions=param_dist_xgb,
    n_iter=30, scoring="average_precision", cv=cv, n_jobs=-1,
    random_state=42, refit=True, error_score=0  # ← cambio
)

In [24]:
_ = gs_logreg.fit(X_train, y_train)
_ = rs_rf.fit(X_train, y_train)
_ = rs_xgb.fit(X_train, y_train)

gs_logreg.best_params_, rs_rf.best_params_, rs_xgb.best_params_

1 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\luigu\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\luigu\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\luigu\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:

({'model__C': np.float64(0.21544346900318834)},
 {'model__n_estimators': 200,
  'model__min_samples_split': 10,
  'model__min_samples_leaf': 4,
  'model__max_features': 'sqrt',
  'model__max_depth': 16},
 {'model__subsample': 1.0,
  'model__reg_lambda': 1.0,
  'model__n_estimators': 400,
  'model__max_depth': 4,
  'model__learning_rate': 0.05,
  'model__colsample_bytree': 0.8})

In [25]:
tuned = {
    "logreg_tuned": gs_logreg.best_estimator_,
    "rf_tuned": rs_rf.best_estimator_,
    "xgb_tuned": rs_xgb.best_estimator_,
}

rows = []
for name, clf in tuned.items():
    m = evaluate_cv_prob(clf, X_train, y_train, cv=cv)
    m["model"] = name
    rows.append(m)

tuned_df = pd.DataFrame(rows).set_index("model").sort_values("auc_pr", ascending=False)
tuned_df

Unnamed: 0_level_0,auc_roc,auc_pr,p_at_10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
xgb_tuned,0.593225,0.148521,0.181237
rf_tuned,0.584535,0.143806,0.181237
logreg_tuned,0.572427,0.140731,0.176262


In [26]:
def evaluate_on_test(clf, X_tr, y_tr, X_te, y_te):
    clf.fit(X_tr, y_tr)
    y_prob = clf.predict_proba(X_te)[:,1]
    out = bin_metrics(y_te, y_prob)
    out["p_at_10"] = precision_at_k(y_te, y_prob, k=0.10)
    return out

# Baselines
test_rows = []
for name, clf in models.items():
    test_rows.append({"model": name, **evaluate_on_test(clf, X_train, y_train, X_test, y_test)})

# Tuned
for name, clf in tuned.items():
    test_rows.append({"model": name, **evaluate_on_test(clf, X_train, y_train, X_test, y_test)})

test_df = pd.DataFrame(test_rows).set_index("model").sort_values("auc_pr", ascending=False)
test_df

Unnamed: 0_level_0,auc_roc,auc_pr,p_at_10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rf_tuned,0.578434,0.14199,0.181818
xgb_tuned,0.585029,0.132234,0.144886
logreg_tuned,0.566414,0.131783,0.161932
xgb,0.566004,0.127087,0.144886
logreg,0.549928,0.124853,0.144886
rf,0.526445,0.107574,0.110795


In [27]:
# Construimos un resumen limpio por "modelo base" (logreg, rf, xgb) y etapa (baseline_cv, tuned_cv, holdout_test)
def base_name(m: str) -> str:
    return m.replace("_tuned", "")

rows = []

# baseline_cv
for m in baseline_df.index:
    r = baseline_df.loc[m].to_dict()
    rows.append({"base": m, "stage": "baseline_cv", **r})

# tuned_cv
for m in tuned_df.index:
    r = tuned_df.loc[m].to_dict()
    rows.append({"base": base_name(m), "stage": "tuned_cv", **r})

# holdout_test (incluye baselines y tuned)
for m in test_df.index:
    r = test_df.loc[m].to_dict()
    rows.append({"base": base_name(m), "stage": "holdout_test", **r})

long_df = pd.DataFrame(rows)

# Pivot ordenado: métricas (filas = modelo base; columnas = etapa)
clean_summary = long_df.pivot_table(
    index="base", columns="stage", values=["auc_pr", "auc_roc", "p_at_10"]
).sort_index()

clean_summary

Unnamed: 0_level_0,auc_pr,auc_pr,auc_pr,auc_roc,auc_roc,auc_roc,p_at_10,p_at_10,p_at_10
stage,baseline_cv,holdout_test,tuned_cv,baseline_cv,holdout_test,tuned_cv,baseline_cv,holdout_test,tuned_cv
base,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
logreg,0.137923,0.128318,0.140731,0.567238,0.558171,0.572427,0.173419,0.153409,0.176262
rf,0.12348,0.124782,0.143806,0.561627,0.55244,0.584535,0.139303,0.146307,0.181237
xgb,0.136853,0.129661,0.148521,0.565373,0.575516,0.593225,0.165601,0.144886,0.181237


In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Combina baselines y tuned (reutiliza 'models' y 'tuned' definidos antes)
all_models = {}
all_models.update(models)        # {'logreg', 'rf', 'xgb'}
all_models.update(tuned)         # {'logreg_tuned', 'rf_tuned', 'xgb_tuned'}

rows = []
for name, clf in all_models.items():
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)[:, 1]
    y_hat = (y_prob >= 0.5).astype(int)  # umbral 0.5 para métricas de clase

    rows.append({
        "model": name,
        "accuracy": accuracy_score(y_test, y_hat),
        "precision": precision_score(y_test, y_hat, zero_division=0),
        "recall": recall_score(y_test, y_hat, zero_division=0),
        "f1": f1_score(y_test, y_hat, zero_division=0),
        "auc_roc": roc_auc_score(y_test, y_prob),
    })

metrics_holdout = pd.DataFrame(rows).set_index("model").sort_values("auc_roc", ascending=False)
metrics_holdout

Unnamed: 0_level_0,accuracy,precision,recall,f1,auc_roc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
xgb_tuned,0.897385,0.0,0.0,0.0,0.585029
rf_tuned,0.762365,0.146233,0.277311,0.191489,0.578434
logreg_tuned,0.679648,0.138837,0.414566,0.208011,0.566414
xgb,0.893974,0.055556,0.002801,0.005333,0.566004
logreg,0.681353,0.132692,0.386555,0.197566,0.549928
rf,0.895395,0.076923,0.002801,0.005405,0.526445


In [30]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def best_f1_metrics(clf, X_tr, y_tr, X_te, y_te):
    clf.fit(X_tr, y_tr)
    y_prob = clf.predict_proba(X_te)[:, 1]
    auc = roc_auc_score(y_te, y_prob)
    # barrido de umbrales
    ths = np.linspace(0.01, 0.99, 99)
    rows = []
    for t in ths:
        y_hat = (y_prob >= t).astype(int)
        rows.append({
            "thr": t,
            "f1": f1_score(y_te, y_hat, zero_division=0),
            "precision": precision_score(y_te, y_hat, zero_division=0),
            "recall": recall_score(y_te, y_hat, zero_division=0),
            "accuracy": accuracy_score(y_te, y_hat),
        })
    dfm = pd.DataFrame(rows)
    best = dfm.iloc[dfm["f1"].values.argmax()].to_dict()
    return {
        "best_thr": float(best["thr"]),
        "f1": float(best["f1"]),
        "precision": float(best["precision"]),
        "recall": float(best["recall"]),
        "accuracy": float(best["accuracy"]),
        "auc_roc": float(auc),
    }

all_models = {}
all_models.update(models)   # {'logreg','rf','xgb'}
all_models.update(tuned)    # {'logreg_tuned','rf_tuned','xgb_tuned'}

rows = []
for name, clf in all_models.items():
    rows.append({"model": name, **best_f1_metrics(clf, X_train, y_train, X_test, y_test)})
best_by_f1 = pd.DataFrame(rows).set_index("model").sort_values("f1", ascending=False)
best_by_f1

Unnamed: 0_level_0,best_thr,f1,precision,recall,accuracy,auc_roc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
logreg_tuned,0.51,0.208075,0.143931,0.37535,0.710063,0.566414
xgb_tuned,0.09,0.208036,0.123739,0.652661,0.495736,0.585029
rf_tuned,0.49,0.205011,0.140625,0.378151,0.702388,0.578434
logreg,0.52,0.203474,0.144366,0.344538,0.726265,0.549928
xgb,0.09,0.198714,0.125554,0.47619,0.61029,0.566004
rf,0.03,0.1867,0.10506,0.837535,0.259522,0.526445


In [31]:
def metrics_at_topk(y_true, y_prob, k=0.10):
    n = len(y_true)
    top = max(1, int(round(k * n)))
    idx = np.argsort(-y_prob)[:top]
    y_hat = np.zeros_like(y_true)
    y_hat[idx] = 1
    return {
        "k_share": k,
        "precision_at_k": float((y_true[idx] == 1).mean()),
        "recall_at_k": float((y_true[idx] == 1).sum() / (y_true == 1).sum()),
        "f1_at_k": float(f1_score(y_true, y_hat, zero_division=0)),
    }

rows = []
for name, clf in all_models.items():
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
    mk = metrics_at_topk(y_test.values, y_prob, k=0.10)
    rows.append({"model": name, "auc_roc": auc, **mk})

at_top10 = pd.DataFrame(rows).set_index("model").sort_values("precision_at_k", ascending=False)
at_top10

Unnamed: 0_level_0,auc_roc,k_share,precision_at_k,recall_at_k,f1_at_k
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rf_tuned,0.578434,0.1,0.181818,0.179272,0.180536
logreg_tuned,0.566414,0.1,0.161932,0.159664,0.16079
xgb,0.566004,0.1,0.144886,0.142857,0.143865
logreg,0.549928,0.1,0.144886,0.142857,0.143865
xgb_tuned,0.585029,0.1,0.144886,0.142857,0.143865
rf,0.526445,0.1,0.110795,0.109244,0.110014
