In [1]:
# Cargar y limpiar los datos

import pandas as pd
import numpy as np
import os
import pickle
import joblib
import json
import gzip
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
    confusion_matrix
)


# Read both splits and apply the same cleaning steps to each of them.
def _clean_frame(frame):
    """Return a cleaned copy of a raw input frame.

    Renames the target column to "default", drops the "ID" column, recodes
    EDUCATION values 0/5/6 to 4 and MARRIAGE value 0 to 3, and finally drops
    every row that still contains a missing value.
    """
    recode_map = {"EDUCATION": {0: 4, 5: 4, 6: 4}, "MARRIAGE": {0: 3}}
    return (
        frame.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        .replace(recode_map)
        .dropna()
    )


train_df = _clean_frame(pd.read_csv("../files/input/train_data.csv.zip"))
test_df = _clean_frame(pd.read_csv("../files/input/test_data.csv.zip"))


In [2]:
# Separate the predictor columns from the "default" target for each split.

target_col = "default"
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]


In [3]:

# Preprocessing stage of the pipeline: one-hot encode the categorical columns
# and pass every other column through unchanged.

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
# Every remaining predictor column, in its original order.
numerical_features = X_train.columns[
    ~X_train.columns.isin(categorical_features)
].tolist()

# Categories unseen during fit are ignored at transform time instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    [("OneHotEncoder", one_hot_encoder, categorical_features)],
    remainder="passthrough",
)

def create_sample_weights(y, positive_class_factor=1.8):
    """Build per-sample weights that balance a binary 0/1 target.

    Each class receives the weight ``n_samples / (2 * class_count)``; the
    weight of class 1 is additionally multiplied by ``positive_class_factor``.

    Parameters
    ----------
    y : array-like of non-negative integers
        Class labels (list, NumPy array or pandas Series). Samples equal to 0
        receive the class-0 weight; every other sample receives the class-1
        weight.
    positive_class_factor : float, default=1.8
        Extra multiplier applied to the class-1 weight.

    Returns
    -------
    numpy.ndarray
        One weight per sample, in the same order as ``y``.

    Raises
    ------
    ValueError
        If ``y`` does not contain at least one sample of class 0 and one of
        class 1, since the balanced weights are undefined in that case.
    """
    # Convert up front: with a plain list, ``y == 0`` is a single bool and
    # np.where would return one scalar weight instead of one per sample.
    labels = np.asarray(y)

    # minlength=2 guarantees both counts exist even if a class is absent.
    class_counts = np.bincount(labels, minlength=2)
    if class_counts[0] == 0 or class_counts[1] == 0:
        raise ValueError(
            "create_sample_weights needs at least one sample of class 0 "
            "and one sample of class 1"
        )

    total_samples = labels.size
    weight_for_0 = total_samples / (2.0 * class_counts[0])
    weight_for_1 = total_samples / (2.0 * class_counts[1]) * positive_class_factor
    return np.where(labels == 0, weight_for_0, weight_for_1)

# Per-sample weights for the training target, passed to the classifier at fit time.
sample_weights = create_sample_weights(y_train)

# Two-hidden-layer MLP trained with Adam and early stopping enabled.
# Note: the alpha set here is only a default; the grid search defined later
# in the notebook supplies its own value for classifier__alpha.
mlp_classifier = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(128, 64),
    alpha=0.001,
    learning_rate_init=0.001,
    batch_size=256,
    max_iter=2000,
    early_stopping=True,
    random_state=42,
)

# Steps run in order: encode -> standardize -> PCA -> keep 5 best components -> MLP.
# Note: n_components=0.70 is likewise replaced by the grid search candidates.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.70)),
    ('selector', SelectKBest(f_classif, k=5)),
    ('classifier', mlp_classifier),
]
pipeline = Pipeline(steps=pipeline_steps)






In [4]:
# Tune hyperparameters with stratified 5-fold cross-validation.

# Candidate values; each key addresses a pipeline step as "<step>__<parameter>".
param_grid = {
    'pca__n_components': [0.75, 0.80],
    'classifier__alpha': [0.0003],
}

# Shuffled, seeded folds so the split is reproducible between runs.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv_splitter,
    scoring='precision',  # candidates are ranked by precision
    n_jobs=-1,
    verbose=1,
)

# The "classifier__" prefix routes the weights to the classifier step's fit().
grid_search.fit(X_train, y_train, classifier__sample_weight=sample_weights)

# Predict on the test set using a custom decision threshold on P(class 1).
# NOTE(review): the 0.62 threshold is hard-coded and only evaluated against the
# test split here - confirm how it was selected.
# NOTE(review): the metrics persisted later in this notebook are computed from
# grid_search.predict(...), not from this thresholded prediction - confirm
# that this is intended.
best_threshold = 0.62
y_probs = grid_search.predict_proba(X_test)[:, 1]
y_pred_final = (y_probs > best_threshold).astype(int)

# Report the test-set scores of the thresholded prediction.
test_precision = precision_score(y_test, y_pred_final)
test_recall = recall_score(y_test, y_pred_final)
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred_final)

print(f"\nPrecisión: {test_precision:.3f}")
print(f"Recall: {test_recall:.3f}")
print(f"Balanced Accuracy: {test_balanced_accuracy:.3f}")

Fitting 5 folds for each of 2 candidates, totalling 10 fits

Precisión: 0.381
Recall: 0.645
Balanced Accuracy: 0.681


In [5]:
# Persist the fitted grid search object as a gzip-compressed pickle.

model_path = "../files/models/model.pkl.gz"
model_dir = os.path.dirname(model_path)
os.makedirs(model_dir, exist_ok=True)  # create the target folder if missing

with gzip.open(model_path, mode="wb") as model_file:
    pickle.dump(grid_search, model_file)


In [6]:
# Predictions of the fitted grid search for both splits, used to compute the metrics.

y_pred_train, y_pred_test = (
    grid_search.predict(features) for features in (X_train, X_test)
)

def compute_metrics(y_true, y_pred, dataset_name):
    """Return a metrics record for one dataset split.

    The record is a dict with "type" set to "metrics", "dataset" set to
    ``dataset_name``, and the precision, balanced accuracy, recall and F1
    score of ``y_pred`` against ``y_true``.
    """
    scorers = {
        "precision": precision_score,
        "balanced_accuracy": balanced_accuracy_score,
        "recall": recall_score,
        "f1_score": f1_score,
    }
    record = {"type": "metrics", "dataset": dataset_name}
    for metric_name, scorer in scorers.items():
        record[metric_name] = scorer(y_true, y_pred)
    return record

# One metrics record per split, train first.
metrics = [
    compute_metrics(split_true, split_pred, split_name)
    for split_name, split_true, split_pred in (
        ("train", y_train, y_pred_train),
        ("test", y_test, y_pred_test),
    )
]


In [7]:
# Compute the confusion matrices and save all records.
def compute_confusion(y_true, y_pred, dataset_name):
    """Return a confusion-matrix record for one dataset split.

    The matrix is computed for labels 0 and 1 (rows are true labels, columns
    are predicted labels) and its four cells are stored as plain ints under
    the "true_0" and "true_1" keys.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Row-major flattening of the 2x2 matrix: (0,0), (0,1), (1,0), (1,1).
    true0_pred0, true0_pred1, true1_pred0, true1_pred1 = (
        int(cell) for cell in matrix.ravel()
    )
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {"predicted_0": true0_pred0, "predicted_1": true0_pred1},
        "true_1": {"predicted_0": true1_pred0, "predicted_1": true1_pred1},
    }

# Confusion-matrix records for both splits, train first.
confusion_records = [
    compute_confusion(y_train, y_pred_train, "train"),
    compute_confusion(y_test, y_pred_test, "test"),
]

# Replace any confusion-matrix records left over from a previous run of this
# cell instead of blindly appending: otherwise re-executing the cell stacks
# duplicates in `metrics` and writes them to the output file as well.
# Slice assignment keeps `metrics` the same list object.
metrics[:] = [
    record for record in metrics if record.get("type") != "cm_matrix"
] + confusion_records

metrics_path = "../files/output/metrics.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

# The file holds one JSON object per line.
with open(metrics_path, "w", encoding="utf-8") as f:
    for item in metrics:
        json.dump(item, f)
        f.write("\n")

