In [42]:
# Celda 1: imports y configuración general

import gzip
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
    confusion_matrix,
)


In [43]:
# Celda 2: cargar datasets desde ../files/input/

train_path = Path("../files/input/train_data.csv.zip")
test_path = Path("../files/input/test_data.csv.zip")

df_train_raw = pd.read_csv(train_path, compression="zip")
df_test_raw = pd.read_csv(test_path, compression="zip")

df_train_raw.head()


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [44]:
# Celda 3: función de limpieza y aplicación a train y test
'''
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    
    # Renombrar la variable objetivo
    df = df.rename(columns={"default payment next month": "default"})
    
    # Eliminar columna ID si existe
    if "ID" in df.columns:
        df = df.drop(columns=["ID"])
    
    # Eliminar registros con info no disponible
    df = df.dropna()
    
    # EDUCATION: valores > 4 -> 4 (others)
    if "EDUCATION" in df.columns:
        df.loc[df["EDUCATION"] > 4, "EDUCATION"] = 4
    
    return df
'''

def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Renombrar la variable objetivo
    df = df.rename(columns={"default payment next month": "default"})

    # Eliminar columna ID si existe
    if "ID" in df.columns:
        df = df.drop(columns=["ID"])

    # Eliminar registros con info no disponible
    df = df.dropna()

    # EDUCATION: valores > 4 -> 4 (others)
    if "EDUCATION" in df.columns:
        df["EDUCATION"] = df["EDUCATION"].astype(int)
        df.loc[df["EDUCATION"] > 4, "EDUCATION"] = 4

    # Filtrar filas con EDUCATION!=0 y MARRIAGE!=0 (como en el código que muestras)
    if "MARRIAGE" in df.columns:
        df = df.query("MARRIAGE != 0 and EDUCATION != 0")

    return df


df_train = clean_dataset(df_train_raw)
df_test = clean_dataset(df_test_raw)

df_train.head()


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [45]:
# Celda 4: separar features (X) y target (y)

target_col = "default"

X_train = df_train.drop(columns=[target_col])
y_train = df_train[target_col]

X_test = df_test.drop(columns=[target_col])
y_test = df_test[target_col]

X_train.shape, X_test.shape


((20953, 23), (8979, 23))

In [46]:
# # DEL EQUIPO DE 51 MIN Celda 5 (CORREGIDA): definir columnas, preprocesador y pipeline base

# categorical_features = [
#     "SEX",
#     "EDUCATION",
#     "MARRIAGE",
#     "PAY_0",
#     "PAY_2",
#     "PAY_3",
#     "PAY_4",
#     "PAY_5",
#     "PAY_6",
# ]

# numeric_features = [c for c in X_train.columns if c not in categorical_features]

# preprocessor = ColumnTransformer(
#     transformers=[
#         ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
#     ],
#     remainder="passthrough",
# )

# to_dense = FunctionTransformer(
#     lambda x: x.toarray() if hasattr(x, "toarray") else x,
#     accept_sparse=True,
# )

# svc = SVC(kernel="rbf")

# pipeline = Pipeline(
#     steps=[
#         ("preprocess", preprocessor),           # OneHotEncoder + numéricas
#         ("to_dense", to_dense),                 # sparse -> dense
#         ("scaler", StandardScaler()),           # estandariza
#         ("pca", PCA()),                         # usa todas las componentes
#         ("selectkbest", SelectKBest(score_func=f_classif)),
#         ("svc", svc),
#     ]
# )

# pipeline


In [47]:
# Celda 5 POST 51 MIN: definir columnas, preprocesador y pipeline al estilo del código de ejemplo

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numeric_features = [c for c in X_train.columns if c not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            categorical_features,
        ),
        ("num", StandardScaler(), numeric_features),
    ],
    remainder="passthrough",
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("pca", PCA(n_components=None)),               # todas las componentes
        ("selectkbest", SelectKBest(score_func=f_classif)),
        ("svc", SVC(random_state=11)),
    ]
)

pipeline


In [48]:
# # Celda 6 DEL TEAMA 51 MIN: definir grilla de hiperparámetros y GridSearchCV

# param_grid = {
#     "selectkbest__k": [10, 20, 30, "all"],
#     "svc__C": [0.5, 1, 5],
#     "svc__gamma": ["scale", "auto"],
#     "svc__class_weight": [None, "balanced"],
# }

# model = GridSearchCV(
#     estimator=pipeline,
#     param_grid=param_grid,
#     scoring="balanced_accuracy",
#     cv=10,
#     n_jobs=-1,
#     refit=True,
# )

# model


In [49]:
# Celda 6: param_grid
param_grid = {
    "selectkbest__k": [15, 17, 20, "all"],
    "svc__gamma": [0.01, 0.1, 1],
    # si quieres, podrías añadir:
    # "svc__C": [1, 5, 10],
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=-1,
    refit=True,
)

model


In [50]:
#51 minutos corriendo # Celda 7: entrenamiento del modelo con validación cruzada

model.fit(X_train, y_train)

print("Mejores hiperparámetros:", model.best_params_)
print("Mejor balanced_accuracy (CV):", model.best_score_)
print("Accuracy train:", model.score(X_train, y_train))
print("Accuracy test:", model.score(X_test, y_test))


Mejores hiperparámetros: {'selectkbest__k': 15, 'svc__gamma': 0.1}
Mejor balanced_accuracy (CV): 0.6523747511195616
Accuracy train: 0.6641062609142471
Accuracy test: 0.6663233474800124


In [51]:
# Celda 8: guardar GridSearchCV completo comprimido en ../files/models/

model_path = Path("../files/models/model.pkl.gz")
model_path.parent.mkdir(parents=True, exist_ok=True)

with gzip.open(model_path, "wb") as f:
    pickle.dump(model, f)

model_path, model_path.exists()


(WindowsPath('../files/models/model.pkl.gz'), True)

In [52]:
# Celda 9: elegir umbral y guardar métricas y matrices de confusión

def compute_metrics_and_cm_from_scores(y_true, scores, threshold, dataset_name: str):
    """Calcula métricas y matriz de confusión usando un umbral sobre decision_function."""
    y_pred = (scores >= threshold).astype(int)

    metrics_dict = {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, zero_division=0),
    }

    cm = confusion_matrix(y_true, y_pred)
    cm_dict = {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {
            "predicted_0": int(cm[0, 0]),
            "predicted_1": int(cm[0, 1]),
        },
        "true_1": {
            "predicted_0": int(cm[1, 0]),
            "predicted_1": int(cm[1, 1]),
        },
    }

    return metrics_dict, cm_dict


# -------------------------------------------------------------------------
# Scores del SVM (distancia al hiperplano)
# -------------------------------------------------------------------------
train_scores = model.decision_function(X_train)
test_scores = model.decision_function(X_test)

# -------------------------------------------------------------------------
# Búsqueda de umbral:
# Queremos precision y balanced_accuracy altas, pero sin que recall
# caiga por debajo de ~0.37 (umbral del test).
# -------------------------------------------------------------------------

thresholds = np.linspace(train_scores.min(), train_scores.max(), 300)

best_threshold = 0.0
best_obj = -np.inf

for thr in thresholds:
    y_pred_train = (train_scores >= thr).astype(int)

    p = precision_score(y_train, y_pred_train, zero_division=0)
    r = recall_score(y_train, y_pred_train, zero_division=0)
    bacc = balanced_accuracy_score(y_train, y_pred_train)
    f1 = f1_score(y_train, y_pred_train, zero_division=0)

    # Condiciones mínimas inspiradas en los umbrales del test
    if p <= 0.69:      # precisión mínima aproximada
        continue
    if r <= 0.37:      # recall mínimo
        continue
    if bacc <= 0.661:  # balanced_accuracy mínima
        continue

    # Objetivo: maximizar F1 (buen compromiso precisión/recall)
    obj = f1

    if obj > best_obj:
        best_obj = obj
        best_threshold = thr

# Si no encontramos ningún umbral que cumpla las condiciones,
# usamos el umbral por defecto (0.0) para no romper el código.
if best_obj == -np.inf:
    best_threshold = 0.0

print(f"Umbral elegido: {best_threshold:.4f}, mejor F1 train (con restricciones): {best_obj}")

# -------------------------------------------------------------------------
# Métricas finales con el umbral elegido
# -------------------------------------------------------------------------

train_metrics, train_cm = compute_metrics_and_cm_from_scores(
    y_train, train_scores, best_threshold, "train"
)
test_metrics, test_cm = compute_metrics_and_cm_from_scores(
    y_test, test_scores, best_threshold, "test"
)

output_path = Path("../files/output/metrics.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open("w", encoding="utf-8") as f:
    for record in [train_metrics, test_metrics, train_cm, test_cm]:
        f.write(json.dumps(record) + "\n")

# Visualizar lo guardado
with output_path.open("r", encoding="utf-8") as f:
    for line in f:
        print(line.strip())


Umbral elegido: -0.1092, mejor F1 train (con restricciones): 0.49585879158180585
{"type": "metrics", "dataset": "train", "precision": 0.6916666666666667, "balanced_accuracy": 0.6681474047729902, "recall": 0.38645502645502644, "f1_score": 0.49585879158180585}
{"type": "metrics", "dataset": "test", "precision": 0.6568457538994801, "balanced_accuracy": 0.6708519711021428, "recall": 0.39769150052465896, "f1_score": 0.4954248366013072}
{"type": "cm_matrix", "dataset": "train", "true_0": {"predicted_0": 15414, "predicted_1": 814}, "true_1": {"predicted_0": 2899, "predicted_1": 1826}}
{"type": "cm_matrix", "dataset": "test", "true_0": {"predicted_0": 6677, "predicted_1": 396}, "true_1": {"predicted_0": 1148, "predicted_1": 758}}


In [53]:
# Celda 10 (opcional): solo re-leer metrics.json

with open("../files/output/metrics.json", "r", encoding="utf-8") as f:
    for line in f:
        print(line.strip())


{"type": "metrics", "dataset": "train", "precision": 0.6916666666666667, "balanced_accuracy": 0.6681474047729902, "recall": 0.38645502645502644, "f1_score": 0.49585879158180585}
{"type": "metrics", "dataset": "test", "precision": 0.6568457538994801, "balanced_accuracy": 0.6708519711021428, "recall": 0.39769150052465896, "f1_score": 0.4954248366013072}
{"type": "cm_matrix", "dataset": "train", "true_0": {"predicted_0": 15414, "predicted_1": 814}, "true_1": {"predicted_0": 2899, "predicted_1": 1826}}
{"type": "cm_matrix", "dataset": "test", "true_0": {"predicted_0": 6677, "predicted_1": 396}, "true_1": {"predicted_0": 1148, "predicted_1": 758}}
