In [3]:
import os
import gzip
import json
import pickle
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
    confusion_matrix,
)
import warnings
# Suppress every Python warning raised from this point on.
warnings.filterwarnings('ignore')

def load_and_clean(path):
    """Read a zip-compressed CSV dataset and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. Rename "default payment next month" to "default".
      2. Drop the "ID" column when it is present.
      3. Drop every row that contains a missing value.
      4. Collapse EDUCATION codes above 4 into code 4 ("others").

    Args:
        path: Location of the zip-compressed CSV file.

    Returns:
        The cleaned pandas DataFrame.
    """
    frame = (
        pd.read_csv(path, compression="zip")
        .rename(columns={"default payment next month": "default"})
        # The identifier column is optional, so a missing "ID" is not an error.
        .drop(columns=["ID"], errors="ignore")
        .dropna()
    )

    # Any education code beyond 4 is folded into the "others" bucket (4).
    above_others = frame["EDUCATION"] > 4
    frame.loc[above_others, "EDUCATION"] = 4

    return frame

def split_xy(df):
    """Split a dataset into its predictor columns and its target column.

    Args:
        df: DataFrame that contains a "default" column.

    Returns:
        A ``(features, target)`` tuple: ``features`` is a copy of ``df``
        without the "default" column and ``target`` is that column.
    """
    target_column = "default"
    features = df.drop(columns=[target_column])
    target = df[target_column]
    return features, target

def build_pipeline(categorical_cols, numerical_cols):
    """Assemble the unfitted preprocessing + MLP classification pipeline.

    The pipeline steps, in order, are:
      * "preprocessor": one-hot encodes ``categorical_cols`` and
        standard-scales ``numerical_cols``.
      * "pca": principal component analysis.
      * "scaler": a second standard scaling, applied to the PCA output.
      * "select": SelectKBest scored with ``f_classif``.
      * "mlp": MLPClassifier with early stopping and a fixed random seed.

    Args:
        categorical_cols: Column names routed to the one-hot encoder.
        numerical_cols: Column names routed to the first standard scaler.

    Returns:
        The unfitted sklearn ``Pipeline``.
    """
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    column_steps = [
        ("categorical", one_hot, categorical_cols),
        ("numerical", StandardScaler(), numerical_cols),
    ]
    classifier = MLPClassifier(max_iter=1000, early_stopping=True, random_state=42)

    return Pipeline(
        steps=[
            ("preprocessor", ColumnTransformer(transformers=column_steps)),
            ("pca", PCA()),
            ("scaler", StandardScaler()),
            ("select", SelectKBest(f_classif)),
            ("mlp", classifier),
        ]
    )

def save_model(model, path):
    """Pickle ``model`` into a gzip-compressed file at ``path``.

    Missing parent directories are created first. A ``path`` with no
    directory component (a bare filename) is written to the current working
    directory.

    Args:
        model: Any picklable object.
        path: Destination file path.
    """
    parent = os.path.dirname(path)
    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with gzip.open(path, "wb") as file:
        pickle.dump(model, file)

def compute_metrics(model, x, y, dataset_name):
    """Score ``model`` on one dataset and build its two report records.

    Args:
        model: Fitted estimator exposing ``predict``.
        x: Feature matrix passed to ``model.predict``.
        y: True labels; the record keys assume the binary labels 0 and 1.
        dataset_name: Label stored under the "dataset" key of both records
            (the caller uses "train" and "test").

    Returns:
        A ``(metrics, cm)`` tuple of JSON-serialisable dicts:
          * ``metrics``: type "metrics" with precision, balanced accuracy,
            recall and F1 score.
          * ``cm``: type "cm_matrix" with the four confusion-matrix counts
            keyed by true label and then by predicted label.
    """
    y_pred = model.predict(x)

    metrics = {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": precision_score(y, y_pred, zero_division=0),
        "balanced_accuracy": balanced_accuracy_score(y, y_pred),
        "recall": recall_score(y, y_pred, zero_division=0),
        "f1_score": f1_score(y, y_pred, zero_division=0),
    }

    # Pinning labels guarantees a 2x2 matrix, so the 4-way unpack also works
    # when only one class appears in y and y_pred.
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()
    cm = {
        "type": "cm_matrix",
        "dataset": dataset_name,
        # int() converts the numpy integers into plain ints for json.dumps.
        "true_0": {"predicted_0": int(tn), "predicted_1": int(fp)},
        "true_1": {"predicted_0": int(fn), "predicted_1": int(tp)},
    }

    return metrics, cm

# =============================================================================
# EJECUCIÓN PRINCIPAL
# =============================================================================

print("Cargando datos...")

# Locate the project files relative to the working directory. Two layouts are
# supported: running from the project root ("" prefix) or from a direct
# subfolder such as homework/ ("../" prefix). The first prefix under which the
# training archive exists is used for every path.
for base in ("", "../"):
    if os.path.exists(base + "files/input/train_data.csv.zip"):
        train_path = base + "files/input/train_data.csv.zip"
        test_path = base + "files/input/test_data.csv.zip"
        model_path = base + "files/models/model.pkl.gz"
        metrics_path = base + "files/output/metrics.json"
        break
else:
    # Neither layout matched: stop before any training work starts.
    raise FileNotFoundError("No se encontraron los archivos de datos. Verifica la estructura del proyecto.")

print(f"Usando ruta: {train_path}")

# Load both partitions, then separate the predictors from the target.
df_train, df_test = load_and_clean(train_path), load_and_clean(test_path)
(x_train, y_train), (x_test, y_test) = split_xy(df_train), split_xy(df_test)

print(f"Train shape: {x_train.shape}")
print(f"Test shape: {x_test.shape}")

# The categorical predictors are listed explicitly; every remaining predictor
# column is treated as numerical.
categorical = ["SEX", "EDUCATION", "MARRIAGE"]
numerical = [name for name in x_train.columns if name not in categorical]

print("\nConstruyendo pipeline...")
pipeline = build_pipeline(categorical_cols=categorical, numerical_cols=numerical)

# Search space for the grid search, kept deliberately small for the MLP.
param_grid = {
    "pca__n_components": [0.95],  # keep 95% of the variance
    "select__k": [15, 20],  # number of features kept by SelectKBest
    "mlp__hidden_layer_sizes": [(100,), (100, 50)],  # simple architectures
    "mlp__alpha": [0.001, 0.01],  # L2 regularisation
}

# Derive the workload from the grid and the fold count so the progress
# message stays correct whenever either of them is edited.
cv_folds = 10
n_candidates = 1
for candidate_values in param_grid.values():
    n_candidates *= len(candidate_values)

print("\nIniciando GridSearchCV...")
print(f"Combinaciones a probar: {n_candidates} × {cv_folds} folds = {n_candidates * cv_folds} entrenamientos")
print("(Esto tomará aproximadamente 5-10 minutos)")

model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv_folds,
    scoring="balanced_accuracy",
    n_jobs=-1,  # use every available core
    verbose=1,  # report progress
)

model.fit(x_train, y_train)

print(f"\n{'='*60}")
print(f"Mejores parámetros: {model.best_params_}")
print(f"Mejor score (balanced_accuracy): {model.best_score_:.4f}")
print(f"{'='*60}")

# Persist the fitted search object.
print("\nGuardando modelo...")
save_model(model, model_path)

# Make sure the output folder exists, then evaluate on both partitions.
print("Calculando y guardando métricas...")
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

metrics_train, cm_train = compute_metrics(model, x_train, y_train, "train")
metrics_test, cm_test = compute_metrics(model, x_test, y_test, "test")

# One JSON object per line: both metrics records first, then both
# confusion-matrix records.
records = (metrics_train, metrics_test, cm_train, cm_test)
with open(metrics_path, "w", encoding="utf-8") as writer:
    writer.writelines(json.dumps(record) + "\n" for record in records)

print("\n" + "="*60)
print("✓ Entrenamiento completado exitosamente")
print(f"✓ Modelo guardado en {model_path}")
print(f"✓ Métricas guardadas en {metrics_path}")
print("="*60)

# Final summary: the search's own score on each partition.
print(f"\nScore en train: {model.score(x_train, y_train):.4f}")
print(f"Score en test: {model.score(x_test, y_test):.4f}")

# Detailed per-partition metrics taken from the records computed above.
print("\nMétricas detalladas:")
print(f"Train - Precision: {metrics_train['precision']:.4f}, Recall: {metrics_train['recall']:.4f}, F1: {metrics_train['f1_score']:.4f}")
print(f"Test  - Precision: {metrics_test['precision']:.4f}, Recall: {metrics_test['recall']:.4f}, F1: {metrics_test['f1_score']:.4f}")

Cargando datos...
Usando ruta: ../files/input/train_data.csv.zip
Train shape: (21000, 23)
Test shape: (9000, 23)

Construyendo pipeline...

Iniciando GridSearchCV...
Combinaciones a probar: 2 (k) × 2 (hidden) × 2 (alpha) = 8 × 10 folds = 80 entrenamientos
(Esto tomará aproximadamente 5-10 minutos)
Fitting 10 folds for each of 8 candidates, totalling 80 fits

Mejores parámetros: {'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (100, 50), 'pca__n_components': 0.95, 'select__k': 20}
Mejor score (balanced_accuracy): 0.6583

Guardando modelo...
Calculando y guardando métricas...

✓ Entrenamiento completado exitosamente
✓ Modelo guardado en ../files/models/model.pkl.gz
✓ Métricas guardadas en ../files/output/metrics.json

Score en train: 0.6542
Score en test: 0.6660

Métricas detalladas:
Train - Precision: 0.6754, Recall: 0.3584, F1: 0.4683
Test  - Precision: 0.6637, Recall: 0.3845, F1: 0.4869
