In [1]:
import pandas as pd

# Step 1: load the raw data and clean it as the assignment requires.
def _load_clean(csv_path):
    """Read one zipped CSV and return the cleaned frame.

    Cleaning steps, applied in this order:
      1. rename "default payment next month" to "default";
      2. drop the "ID" column when it is present;
      3. drop every row that contains a NaN;
      4. cap EDUCATION at 4, so values above 4 collapse into category 4
         ("others").

    NOTE(review): only NaN rows are removed. If the data encodes missing
    information some other way (e.g. a sentinel code), those rows are kept.
    Confirm against the assignment statement.
    """
    return (
        pd.read_csv(csv_path)
        .rename(columns={"default payment next month": "default"})
        # errors="ignore" makes the drop a no-op when "ID" is absent.
        .drop(columns=["ID"], errors="ignore")
        .dropna(axis=0)
        # No NaN is left at this point, so clipping only touches values > 4.
        .assign(EDUCATION=lambda frame: frame["EDUCATION"].clip(upper=4))
    )


train = _load_clean("../files/input/train_data.csv.zip")
test = _load_clean("../files/input/test_data.csv.zip")



In [2]:
# Step 2: split each dataset into explanatory variables (X) and target (y).
def _split_features_target(frame):
    """Return `(X, y)`: `frame` without the "default" column, and a copy of that column."""
    target = frame["default"].copy()
    features = frame.drop(columns=["default"])
    return features, target


# Training set
X_train, y_train = _split_features_target(train)

# Test set
X_test, y_test = _split_features_target(test)


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# Column roles: these three fields are categorical; every other column of
# X_train is treated as numeric.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numeric_features = [
    column for column in X_train.columns if not (column in categorical_features)
]

# One-hot encode the categorical columns and pass the numeric ones through
# untouched. Categories unseen during fit are ignored at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot_encoder, categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

# Step names are referenced by the hyperparameter grid ("selectkbest__k",
# "svc__C", ...), so they must stay exactly as written here.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("pca", PCA()),  # default settings: keeps every component
    ("scaler", StandardScaler()),  # standardises the PCA output
    ("selectkbest", SelectKBest(score_func=f_classif, k=20)),
    ("svc", SVC()),  # base SVC; tuned later by the grid search
]
pipeline = Pipeline(steps=pipeline_steps)

In [4]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Step 4: cross-validated hyperparameter search over a wide grid.

# Feature-selection axis: keep enough columns so the SVC is not starved of
# information.
selector_grid = {"selectkbest__k": [15, 20, 23, 26, 30]}

# SVC axes.
svc_grid = {
    # Larger C means a stricter margin.
    "svc__C": [1, 2, 5, 10, 20, 50, 100],
    # Several reasonable gamma levels for standardised data.
    "svc__gamma": ["scale", 0.1, 0.05, 0.01, 0.005, 0.001],
    # Try both without and with class weighting.
    "svc__class_weight": [None, "balanced"],
    "svc__kernel": ["rbf"],
}

param_grid = {**selector_grid, **svc_grid}

# Shuffled, stratified 10-fold split with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring="balanced_accuracy",  # model-selection criterion
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# fit() returns the fitted search object, so `best_model` is the whole
# GridSearchCV instance (not just its best estimator).
best_model = grid_search.fit(X_train, y_train)


Fitting 10 folds for each of 420 candidates, totalling 4200 fits


In [5]:
#5
import os
import gzip
import pickle

# Step 5: persist the fitted model as a gzip-compressed pickle.
models_dir = "../files/models"
model_path = "../files/models/model.pkl.gz"

# Create the target folder on first run; a no-op if it already exists.
os.makedirs(models_dir, exist_ok=True)

with gzip.open(model_path, "wb") as model_file:
    pickle.dump(best_model, model_file)


In [6]:
import os
import json
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score,
)

# Step 6: train and test metrics, with class 0 as the positive label
# (described in the original notes as the clients who pay).

# Predictions from the already-fitted model.
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)


def _class0_metrics(dataset_name, y_true, y_pred):
    """Build one "metrics" record for `dataset_name`.

    Precision, recall and F1 are computed with `pos_label=0` and
    `zero_division=0`; balanced accuracy takes no positive label.
    """
    class0_options = {"pos_label": 0, "zero_division": 0}
    return {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": precision_score(y_true, y_pred, **class0_options),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred, **class0_options),
        "f1_score": f1_score(y_true, y_pred, **class0_options),
    }


metrics_list = [
    _class0_metrics("train", y_train, y_train_pred),
    _class0_metrics("test", y_test, y_test_pred),
]

os.makedirs("../files/output", exist_ok=True)

# Overwrite metrics.json so it holds ONLY the aggregate metric records,
# one JSON object per line.
with open("../files/output/metrics.json", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(record) + "\n" for record in metrics_list)


In [7]:
from sklearn.metrics import confusion_matrix
import json

# Step 7: confusion matrices from the model's default predictions, stored in
# the same JSON-lines file that Step 6 writes.


def _cm_record(dataset_name, matrix):
    """Return one "cm_matrix" record for a 2x2 confusion matrix.

    `matrix` is indexed as [true label, predicted label] with labels ordered
    [0, 1]. Counts are cast to `int` so the record is JSON serialisable.
    """
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {
            "predicted_0": int(matrix[0, 0]),
            "predicted_1": int(matrix[0, 1]),
        },
        "true_1": {
            "predicted_0": int(matrix[1, 0]),
            "predicted_1": int(matrix[1, 1]),
        },
    }


def _is_cm_line(line):
    """Return True when `line` is a JSON object whose "type" is "cm_matrix"."""
    try:
        record = json.loads(line)
    except ValueError:
        # Not valid JSON: leave the line alone rather than drop it.
        return False
    return isinstance(record, dict) and record.get("type") == "cm_matrix"


y_train_pred_cm = best_model.predict(X_train)
y_test_pred_cm = best_model.predict(X_test)

cm_train = confusion_matrix(y_train, y_train_pred_cm, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_test_pred_cm, labels=[0, 1])

cm_entries = [
    _cm_record("train", cm_train),
    _cm_record("test", cm_test),
]

# Re-running this cell must not pile up duplicate confusion-matrix records,
# which is what a plain append did. Keep every existing line that is not a
# "cm_matrix" record, then rewrite the file with the fresh entries at the end.
try:
    with open("../files/output/metrics.json", "r", encoding="utf-8") as f:
        kept_lines = [
            # Guarantee a trailing newline so records never fuse together.
            line if line.endswith("\n") else line + "\n"
            for line in f
            if not _is_cm_line(line)
        ]
except FileNotFoundError:
    # Same outcome as the former append mode: the file is created.
    kept_lines = []

with open("../files/output/metrics.json", "w", encoding="utf-8") as f:
    f.writelines(kept_lines)
    f.writelines(json.dumps(entry) + "\n" for entry in cm_entries)
