# **0. Configuration**

In [24]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.metrics import log_loss, roc_auc_score, accuracy_score

from sklearn.model_selection import train_test_split

from sklearn.inspection import permutation_importance

RANDOM_STATE = 42
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 120)

# **1. Load Data and Basic Structure**

## **1.1 Loading data**

In [5]:
df = pd.read_csv("Audiobooks_data.csv", header = None)

In [7]:
df.columns = [
    "ID",
    "Book_length_mins_overall",
    "Book_length_mins_avg",
    "Price_overall",
    "Price_avg",
    "Review",
    "Review 10/10",
    "Minutes_listened",
    "Completion",
    "Support_requests",
    "Last_visited_Minus_Purchase_date",
    "Targets",
]

print(df.head())
print(df.columns.tolist())

    ID  Book_length_mins_overall  Book_length_mins_avg  Price_overall  Price_avg  Review  Review 10/10  \
0  873                    2160.0                  2160          10.13      10.13       0          8.91   
1  611                    1404.0                  2808           6.66      13.33       1          6.50   
2  705                     324.0                   324          10.13      10.13       1          9.00   
3  391                    1620.0                  1620          15.31      15.31       0          9.00   
4  819                     432.0                  1296           7.11      21.33       1          9.00   

   Minutes_listened  Completion  Support_requests  Last_visited_Minus_Purchase_date  Targets  
0               0.0         0.0                 0                                 0        1  
1               0.0         0.0                 0                               182        1  
2               0.0         0.0                 1                             

In [8]:
TARGET_COL = "Targets"

FEATURES = [
    "Book_length_mins_overall",
    "Book_length_mins_avg",
    "Price_overall",
    "Price_avg",
    "Review",
    "Review 10/10",
    "Minutes_listened",
    "Completion",
    "Support_requests",
    "Last_visited_Minus_Purchase_date",
]

REQUIRED_COLS = [TARGET_COL] + FEATURES

missing_cols = [c for c in REQUIRED_COLS if c not in df.columns]
if missing_cols:
    raise ValueError(f"Faltan columnas requeridas en df: {missing_cols}")

# Chequeos rápidos
print("Shape:", df.shape)
print("\nTargets value counts:")
print(df[TARGET_COL].value_counts(dropna=False))

# Revisa nulos en features
print("\nNulos por columna (features):")
print(df[FEATURES].isna().sum().sort_values(ascending=False))

# Asegura target binario (0/1)
# Si Targets viene como True/False o 'yes'/'no', conviértelo explícitamente.
unique_targets = df[TARGET_COL].dropna().unique()
if len(unique_targets) > 2:
    raise ValueError(f"Targets no parece binario. Valores únicos: {unique_targets}")


Shape: (14084, 12)

Targets value counts:
Targets
0    11847
1     2237
Name: count, dtype: int64

Nulos por columna (features):
Book_length_mins_overall            0
Book_length_mins_avg                0
Price_overall                       0
Price_avg                           0
Review                              0
Review 10/10                        0
Minutes_listened                    0
Completion                          0
Support_requests                    0
Last_visited_Minus_Purchase_date    0
dtype: int64


# **2. Identify Temporal Column**

In [9]:
def find_datetime_candidates(dataframe: pd.DataFrame):
    candidates = []
    # 1) columnas ya en datetime
    for col in dataframe.columns:
        if pd.api.types.is_datetime64_any_dtype(dataframe[col]):
            candidates.append(col)
    # 2) columnas tipo object con nombres sugerentes
    name_hints = ("date", "time", "timestamp", "ts", "created", "purchase", "visit", "event")
    for col in dataframe.select_dtypes(include=["object"]).columns:
        if any(h in col.lower() for h in name_hints):
            candidates.append(col)
    return sorted(set(candidates))

datetime_candidates = find_datetime_candidates(df)
print("\nCandidatas a columna de tiempo:", datetime_candidates)

# Si eliges una (por ejemplo, "Purchase_date"), descomenta y ajusta:
TIME_COL = None  # e.g., "Purchase_date"

if TIME_COL is not None:
    # Intentar convertir a datetime si no lo es
    if not pd.api.types.is_datetime64_any_dtype(df[TIME_COL]):
        df[TIME_COL] = pd.to_datetime(df[TIME_COL], errors="coerce")

    # Si hay muchos NaT, cuidado
    nat_rate = df[TIME_COL].isna().mean()
    print(f"\nNaT rate en {TIME_COL}: {nat_rate:.3f}")
    if nat_rate > 0.2:
        print("ADVERTENCIA: muchos valores no convertibles a fecha; split temporal puede ser poco fiable.")

    df = df.sort_values(TIME_COL)


Candidatas a columna de tiempo: []


# **3. Features Selection**

In [10]:
X = df[FEATURES].copy()
y = df[TARGET_COL].copy()

# **4. Split if Temporal Column Exists if not Stratified**

In [11]:
TEST_SIZE = 0.2

if TIME_COL is not None:
    # Holdout temporal: último 20% como test
    n = len(df)
    split_idx = int((1 - TEST_SIZE) * n)

    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    print("\nSplit temporal aplicado.")
    print("Train period:", df[TIME_COL].iloc[:split_idx].min(), "->", df[TIME_COL].iloc[:split_idx].max())
    print("Test  period:", df[TIME_COL].iloc[split_idx:].min(), "->", df[TIME_COL].iloc[split_idx:].max())
else:
    # Split estratificado (mantiene proporción de clases)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    print("\nSplit estratificado aplicado.")


Split estratificado aplicado.


# **5. Preprocessing**

In [12]:
numeric_features = [c for c in FEATURES if pd.api.types.is_numeric_dtype(df[c])]
categorical_features = [c for c in FEATURES if c not in numeric_features]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Sin OneHotEncoder para mantenerlo simple si no aplica.
    # Si detectas categóricas de verdad, activa OneHotEncoder:
    # ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

print("\nNuméricas:", numeric_features)
print("Categóricas:", categorical_features)



Numéricas: ['Book_length_mins_overall', 'Book_length_mins_avg', 'Price_overall', 'Price_avg', 'Review', 'Review 10/10', 'Minutes_listened', 'Completion', 'Support_requests', 'Last_visited_Minus_Purchase_date']
Categóricas: []


# **6. Baseline**

In [13]:
baseline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", DummyClassifier(strategy="prior", random_state=RANDOM_STATE))
])

# **7. Initial Model**

In [14]:
# (A) Regresión logística (interpretable, buen baseline fuerte)
logreg = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(
        max_iter=2000,
        solver="lbfgs",
        class_weight=None,   # si hay desbalance fuerte, puedes probar "balanced"
        random_state=RANDOM_STATE
    ))
])

# (B) Modelo más potente: Gradient Boosting (tabular, no requiere escalado, pero aquí ya está en pipeline)
hgb = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=None,
        max_iter=400,
        random_state=RANDOM_STATE
    ))
])

# **8. Evaluation**

In [15]:
def evaluate_model(name, pipe, X_tr, y_tr, X_te, y_te):
    pipe.fit(X_tr, y_tr)

    # Probabilidades
    p_tr = pipe.predict_proba(X_tr)[:, 1]
    p_te = pipe.predict_proba(X_te)[:, 1]

    # Predicción 0/1 por umbral 0.5 (solo para accuracy)
    yhat_tr = (p_tr >= 0.5).astype(int)
    yhat_te = (p_te >= 0.5).astype(int)

    results = {
        "model": name,
        "logloss_train": log_loss(y_tr, p_tr, labels=[0, 1]),
        "logloss_test":  log_loss(y_te, p_te, labels=[0, 1]),
        "auc_train": roc_auc_score(y_tr, p_tr) if len(np.unique(y_tr)) == 2 else np.nan,
        "auc_test":  roc_auc_score(y_te, p_te) if len(np.unique(y_te)) == 2 else np.nan,
        "acc_train": accuracy_score(y_tr, yhat_tr),
        "acc_test":  accuracy_score(y_te, yhat_te),
    }
    return results

results = []
for name, model in [
    ("Baseline (prior)", baseline),
    ("LogisticRegression", logreg),
    ("HistGradientBoosting", hgb),
]:
    results.append(evaluate_model(name, model, X_train, y_train, X_test, y_test))

results_df = pd.DataFrame(results).sort_values("logloss_test")
print("\nRESULTADOS (ordenados por logloss_test, menor es mejor):")
print(results_df.to_string(index=False))


RESULTADOS (ordenados por logloss_test, menor es mejor):
               model  logloss_train  logloss_test  auc_train  auc_test  acc_train  acc_test
HistGradientBoosting       0.195743      0.205691   0.939461  0.925533   0.921452  0.919063
  LogisticRegression       0.250158      0.241475   0.895807  0.903322   0.901660  0.904153
    Baseline (prior)       0.437793      0.437474   0.500000  0.500000   0.841129  0.841321


# **9. DETECTAR OVERFITTING (train vs test)**

In [16]:
# Señal típica: logloss_train mucho menor que logloss_test (gap grande).
results_df["logloss_gap"] = results_df["logloss_test"] - results_df["logloss_train"]
print("\nGap de log-loss (test - train):")
print(results_df[["model", "logloss_train", "logloss_test", "logloss_gap"]].sort_values("logloss_gap", ascending=False).to_string(index=False))

# Heurística simple (ajústala): gap > 0.05 puede ser sospechoso en muchos problemas
SUSPECT_GAP = 0.05
suspects = results_df[results_df["logloss_gap"] > SUSPECT_GAP]["model"].tolist()
if suspects:
    print("\nModelos sospechosos de overfitting (gap grande):", suspects)
else:
    print("\nNo se ve una señal fuerte de overfitting con esta heurística.")


Gap de log-loss (test - train):
               model  logloss_train  logloss_test  logloss_gap
HistGradientBoosting       0.195743      0.205691     0.009948
    Baseline (prior)       0.437793      0.437474    -0.000319
  LogisticRegression       0.250158      0.241475    -0.008683

No se ve una señal fuerte de overfitting con esta heurística.


# **10. Stopping Criteria**

In [17]:
# Para este flujo: "paro" cuando el mejor modelo NO mejora logloss_test vs el anterior
# por un margen mínimo (delta) y el gap no empeora.
MIN_IMPROVEMENT = 0.002  # mejora mínima en log-loss para considerar que vale la pena
best = results_df.iloc[0]
print("\nMejor modelo actual:", best["model"])
print(f"logloss_test = {best['logloss_test']:.4f}")

print("\nRegla práctica:")
print(f"- Solo cambiar/complicar el modelo si mejora logloss_test al menos {MIN_IMPROVEMENT} y sin aumentar claramente el gap.")


Mejor modelo actual: HistGradientBoosting
logloss_test = 0.2057

Regla práctica:
- Solo cambiar/complicar el modelo si mejora logloss_test al menos 0.002 y sin aumentar claramente el gap.


# **11. Repeated Holdout**

In [19]:
SEEDS = range(10)        # número de repeticiones (puedes subir a 20 después)
TEST_SIZE = 0.2

def evaluate_once(seed, model, X, y):
    X_tr, X_te, y_tr, y_te = train_test_split(
        X,
        y,
        test_size=TEST_SIZE,
        stratify=y,
        random_state=seed
    )

    model.fit(X_tr, y_tr)

    p_tr = model.predict_proba(X_tr)[:, 1]
    p_te = model.predict_proba(X_te)[:, 1]

    return {
        "seed": seed,
        "logloss_train": log_loss(y_tr, p_tr, labels=[0, 1]),
        "logloss_test": log_loss(y_te, p_te, labels=[0, 1]),
        "gap": log_loss(y_te, p_te, labels=[0, 1]) - log_loss(y_tr, p_tr, labels=[0, 1])
    }


results = []

for seed in SEEDS:
    res_lr = evaluate_once(seed, logreg, X, y)
    res_lr["model"] = "LogisticRegression"
    results.append(res_lr)

    res_hgb = evaluate_once(seed, hgb, X, y)
    res_hgb["model"] = "HistGradientBoosting"
    results.append(res_hgb)

results_df = pd.DataFrame(results)

print("\nResultados por split:")
print(results_df.head())


Resultados por split:
   seed  logloss_train  logloss_test       gap                 model
0     0       0.247691      0.251067  0.003375    LogisticRegression
1     0       0.193794      0.214918  0.021124  HistGradientBoosting
2     1       0.247760      0.250638  0.002877    LogisticRegression
3     1       0.194230      0.227998  0.033768  HistGradientBoosting
4     2       0.247698      0.252312  0.004614    LogisticRegression


# **11.1 Statistical Summary**

In [20]:
summary = (
    results_df
    .groupby("model")
    .agg(
        logloss_test_mean=("logloss_test", "mean"),
        logloss_test_std=("logloss_test", "std"),
        gap_mean=("gap", "mean"),
        gap_std=("gap", "std"),
        n_splits=("logloss_test", "count")
    )
    .sort_values("logloss_test_mean")
)

print("\nResumen de validación repetida:")
print(summary.to_string())


Resumen de validación repetida:
                      logloss_test_mean  logloss_test_std  gap_mean   gap_std  n_splits
model                                                                                  
HistGradientBoosting           0.218446          0.011100  0.025058  0.014006        10
LogisticRegression             0.251424          0.007128  0.003199  0.008914        10


In [21]:
"""
El HistGradientBoosting sigue siendo claramente superior en promedio,
pero muestra algo más de variabilidad y un gap mayor que la regresión logística,
aunque sin señales críticas de sobreajuste.
"""

'\nEl HistGradientBoosting sigue siendo claramente superior en promedio,\npero muestra algo más de variabilidad y un gap mayor que la regresión logística,\naunque sin señales críticas de sobreajuste.\n'

# **12. Interpretation**

## **12.1 Logistic Regression**

In [22]:
# Extraer coeficientes de la Logistic Regression
feature_names = logreg.named_steps["preprocess"].get_feature_names_out()
coefs = logreg.named_steps["model"].coef_[0]

coef_df = (
    pd.DataFrame({
        "feature": feature_names,
        "coef": coefs
    })
    .sort_values("coef", ascending=False)
)

print(coef_df)

                            feature       coef
1              Book_length_mins_avg   2.490939
3                         Price_avg   1.454756
8                  Support_requests   1.072270
4                            Review   0.683568
9  Last_visited_Minus_Purchase_date   0.530067
5                      Review 10/10   0.062052
7                        Completion  -0.057207
2                     Price_overall  -1.208013
0          Book_length_mins_overall  -2.047615
6                  Minutes_listened -11.778497


## **12.2 HistGradientBoosting**

In [None]:
# Importancia de features del HGB
perm = permutation_importance(
    hgb,
    X_test,
    y_test,
    n_repeats=20,
    random_state=42,
    scoring="neg_log_loss"
)

feature_names = hgb.named_steps["preprocess"].get_feature_names_out()

importance_df = (
    pd.DataFrame({
        "feature": feature_names,
        "importance_mean": -perm.importances_mean,  # negativo → positivo = empeora
        "importance_std": perm.importances_std
    })
    .sort_values("importance_mean", ascending=False)
)

print(importance_df)
# Más negativo = más importante

                            feature  importance_mean  importance_std
4                            Review        -0.006868        0.001006
7                        Completion        -0.007951        0.001200
0          Book_length_mins_overall        -0.011441        0.001561
2                     Price_overall        -0.020985        0.001576
8                  Support_requests        -0.036538        0.005389
9  Last_visited_Minus_Purchase_date        -0.064472        0.003806
1              Book_length_mins_avg        -0.082967        0.007632
3                         Price_avg        -0.088929        0.004865
5                      Review 10/10        -0.242586        0.013259
6                  Minutes_listened        -0.687587        0.031926


In [None]:
"""
Interpretación:
La probabilidad de recompra está dominada por engagement real (minutos escuchados) y señales extremas de satisfacción (review 10/10). El precio promedio y la longitud típica del contenido modulan la decisión, mientras que métricas más genéricas (review promedio, completion) aportan poco una vez controlado el engagement.
"""

# **13. Using the Model for Decision Making**

In [26]:
# Construir el score operativo

p_test = hgb.predict_proba(X_test)[:, 1]

df_scores = X_test.copy()
df_scores["p_repurchase"] = p_test
df_scores["actual"] = y_test.values

df_scores.sort_values("p_repurchase", ascending=False).head()


Unnamed: 0,Book_length_mins_overall,Book_length_mins_avg,Price_overall,Price_avg,Review,Review 10/10,Minutes_listened,Completion,Support_requests,Last_visited_Minus_Purchase_date,p_repurchase,actual
324,1656.0,4968,6.22,18.66,0,8.91,0.0,475.2,1,141,0.995655,1
1315,1674.0,3348,6.09,12.17,0,8.91,0.0,0.0,0,247,0.995655,1
1383,1134.0,2268,6.93,13.87,0,8.91,0.0,0.0,0,30,0.995655,1
319,1656.0,4968,7.11,21.32,1,10.0,0.0,486.0,0,18,0.995655,1
1879,1890.0,3780,8.0,16.0,0,8.91,0.0,0.0,0,0,0.995655,1


In [27]:
# Segmentación por probabilidad

df_scores["segment"] = pd.cut(
    df_scores["p_repurchase"],
    bins=[0, 0.3, 0.6, 1.0],
    labels=["Low", "Medium", "High"]
)

df_scores["segment"].value_counts(normalize=True)


segment
Low       0.839901
High      0.082712
Medium    0.077387
Name: proportion, dtype: float64

In [28]:
# Evaluar lift por segmento:
df_scores.groupby("segment")["actual"].mean()


  df_scores.groupby("segment")["actual"].mean()


segment
Low       0.049451
Medium    0.463303
High      0.982833
Name: actual, dtype: float64

In [None]:
"""
Construimos un modelo de probabilidad de recompra validado fuera de muestra, identificamos las variables de engagement que lo impulsan, y lo convertimos en un sistema de segmentación accionable para optimizar campañas y evitar gasto ineficiente.
""" 