In [2]:
import pandas as pd
import numpy as np
import json
import gzip
import os
import pickle
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.impute import SimpleImputer 


In [3]:
# ===============================================================================
#                                  FUNCIONES AUXILIARES
# ===============================================================================

def save_estimator(estimator, path="../files/models/model.pkl.gz"):
    """Step 5: pickle ``estimator`` into a gzip-compressed file.

    Args:
        estimator: Any picklable object (here, the fitted search/model).
        path: Destination file. Missing parent directories are created.
            A bare file name (no directory component) is written to the
            current working directory.
    """
    parent_dir = os.path.dirname(path)
    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with gzip.open(path, 'wb') as f:
        pickle.dump(estimator, f)
    
def make_grid_search(estimator, param_grid, cv, scoring='balanced_accuracy', verbose=1, n_jobs=-1):
    """Step 4: build a GridSearchCV object for ``estimator``.

    The search is only constructed here; it is NOT fitted. The caller is
    responsible for calling ``.fit(X, y)`` on the returned object.

    Args:
        estimator: Estimator or pipeline to tune.
        param_grid: Hyperparameter grid forwarded to GridSearchCV.
        cv: Cross-validation setting forwarded to GridSearchCV.
        scoring: Scoring forwarded to GridSearchCV.
        verbose: Verbosity forwarded to GridSearchCV.
        n_jobs: Parallelism setting forwarded to GridSearchCV.

    Returns:
        The configured GridSearchCV instance.
    """
    search_options = dict(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    return GridSearchCV(**search_options)

def format_cm_for_json(cm, dataset_name):
    """Step 7: turn a confusion matrix into a JSON-serialisable dict.

    Args:
        cm: Matrix indexable as ``cm[row, col]``; rows are read as the true
            label and columns as the predicted label, for labels 0 and 1.
        dataset_name: Value stored under the ``'dataset'`` key.

    Returns:
        A dict with keys ``'type'``, ``'dataset'``, ``'true_0'`` and
        ``'true_1'`` (in that order); every count is a plain ``int``.
    """
    def _row_counts(true_label):
        # int() makes the cell a builtin int so json.dumps can serialise it.
        return {
            "predicted_0": int(cm[true_label, 0]),
            "predicted_1": int(cm[true_label, 1]),
        }

    payload = {'type': 'cm_matrix', 'dataset': dataset_name}
    payload['true_0'] = _row_counts(0)
    payload['true_1'] = _row_counts(1)
    return payload

def _build_dataset_report(model, X, y, dataset_name):
    """Predict on one split and return ``(metrics_dict, confusion_matrix_dict)``."""
    y_pred = model.predict(X)
    metrics = {
        'type': 'metrics',
        'dataset': dataset_name,
        'precision': precision_score(y, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'f1_score': f1_score(y, y_pred)
    }
    cm_dict = format_cm_for_json(confusion_matrix(y, y_pred), dataset_name)
    return metrics, cm_dict


def evaluate_and_save_metrics(model, X_train, y_train, X_test, y_test, output_path):
    """Steps 6 & 7: score ``model`` on train and test and save the results.

    Precision, balanced accuracy, recall, F1 and the confusion matrix are
    computed for each split and written to ``output_path`` as one JSON
    object per line, in this order: train metrics, test metrics, train
    confusion matrix, test confusion matrix.

    Args:
        model: Fitted object exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        output_path: Destination file; missing parent directories are
            created. An existing file is overwritten.
    """
    output_dir = os.path.dirname(output_path)
    # os.makedirs("") raises FileNotFoundError, so skip it for a bare file name.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    metrics_train, cm_train_dict = _build_dataset_report(model, X_train, y_train, 'train')
    metrics_test, cm_test_dict = _build_dataset_report(model, X_test, y_test, 'test')

    data_to_save = [metrics_train, metrics_test, cm_train_dict, cm_test_dict]

    with open(output_path, 'w') as f:
        for item in data_to_save:
            f.write(json.dumps(item) + '\n')

In [4]:
# ===============================================================================
#                                  PASO 1 & 2: CARGA Y LIMPIEZA
# ===============================================================================

def _normalize_category_labels(series):
    """Return ``series`` as strings, writing whole-number floats without ".0".

    A column that contained NaN when it was read is parsed as float, so after
    the NaN rows are dropped ``astype(str)`` would yield "4.0" while the same
    column parsed as int in another file yields "4". Rendering whole-number
    float columns as integers keeps the labels identical across files.
    """
    if (
        pd.api.types.is_float_dtype(series)
        and series.notna().all()
        and (series % 1 == 0).all()
    ):
        return series.astype('int64').astype(str)
    return series.astype(str)


def _load_and_clean(path):
    """Read one zipped CSV and apply the row/column clean-up of step 1."""
    frame = (
        pd.read_csv(path, compression='zip')
        # 1.1 Rename the target column to 'default' and remove 'ID'.
        .rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
    )

    # 1.2 Fold EDUCATION values above 4, and the value 0, into category 4 ("others").
    education = frame['EDUCATION']
    frame['EDUCATION'] = np.where((education > 4) | (education == 0), 4, education)

    # 1.3 Drop rows with missing values (the original index labels are kept).
    return frame.dropna()


def _split_and_cast(frame, categorical_columns, numeric_columns):
    """Split ``frame`` into (X, y) and force the expected column types."""
    features = frame.drop(columns=['default'])
    target = frame['default']

    # Columns absent from the frame are skipped rather than raising.
    for col in categorical_columns:
        if col in features.columns:
            features[col] = _normalize_category_labels(features[col])

    for col in numeric_columns:
        if col in features.columns:
            features[col] = pd.to_numeric(features[col], errors='coerce').astype(float)

    # errors='coerce' can introduce NaN; drop those rows and keep y aligned by index.
    features = features.dropna()
    return features, target.loc[features.index]


def carga_y_limpieza(train_path, test_path):
    """Steps 1 & 2: load, clean and split the train and test datasets.

    Args:
        train_path: Path to the zipped training CSV.
        test_path: Path to the zipped test CSV.

    Returns:
        A 6-tuple ``(X_test, X_train, y_test, y_train, categorical_columns,
        numeric_columns)``. Note the order: the test objects come before the
        train objects. Categorical feature columns are strings, numeric
        feature columns are floats, and the row index of each X matches the
        corresponding y.
    """
    pay_columns = [f"PAY_{i}" for i in [0, 2, 3, 4, 5, 6]]
    categorical_columns = ["SEX", "EDUCATION", "MARRIAGE"] + pay_columns

    bill_columns = [f"BILL_AMT{i}" for i in range(1, 7)]
    pay_amt_columns = [f"PAY_AMT{i}" for i in range(1, 7)]
    numeric_columns = ["LIMIT_BAL", "AGE"] + bill_columns + pay_amt_columns

    train_dataset = _load_and_clean(train_path)
    test_dataset = _load_and_clean(test_path)

    X_train, y_train = _split_and_cast(train_dataset, categorical_columns, numeric_columns)
    X_test, y_test = _split_and_cast(test_dataset, categorical_columns, numeric_columns)

    print("‚úÖ Carga y limpieza completadas. Tipos de datos asegurados.")

    return X_test, X_train, y_test, y_train, categorical_columns, numeric_columns

In [None]:
# ===============================================================================
#                                  PASO 3-7: ENTRENAMIENTO Y EVALUACIÓN
# ===============================================================================

def train_and_evaluate_logistic_regression(
    train_path="../files/input/train_data.csv.zip",
    test_path="../files/input/test_data.csv.zip",
    model_path="../files/models/model.pkl.gz",
    output_path="../files/output/metrics.json",
):
    """Steps 3-7: build the pipeline, tune it, save it and write its metrics.

    The pipeline min-max scales the numeric columns, one-hot encodes the
    categorical columns, selects features with a chi-squared SelectKBest and
    fits a liblinear LogisticRegression. Hyperparameters are tuned with a
    10-fold grid search scored by balanced accuracy. The fitted search
    object itself (not only its best estimator) is what gets pickled.

    Args:
        train_path: Zipped training CSV.
        test_path: Zipped test CSV.
        model_path: Where the fitted grid search is saved (gzip pickle).
        output_path: Where the metrics / confusion matrices are written.
    """
    X_test, X_train, y_test, y_train, categorical_columns, numeric_columns = carga_y_limpieza(train_path, test_path)

    # STEP 3: build the pipeline.

    # 3.1 Column transformers.
    numeric_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler())
    ])

    # Categorical branch: constant imputer (for NaN) -> one-hot encoder.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_columns),
            ('cat', categorical_transformer, categorical_columns)
        ],
        remainder='passthrough'
    )

    # 3.2 Full pipeline: preprocessing -> feature selection -> classifier.
    pipeline = Pipeline(steps=[
        ('preprocesador', preprocessor),
        # chi2 picks the best features after one-hot encoding.
        ('selectkbest', SelectKBest(score_func=chi2)),
        ('estimator', LogisticRegression(solver='liblinear', random_state=42, max_iter=5000))
    ])

    print("‚úÖ Pipeline creado y robustecido contra TypeErrors.")

    # STEP 4: hyperparameter search.
    param_grid = {
        # Number of features kept by SelectKBest.
        'selectkbest__k': [40, 60, 80, 'all'],
        # Larger C means weaker regularisation.
        'estimator__C': [0.1, 1, 10, 50, 200, 500],
        'estimator__penalty': ['l1', 'l2'],
        'estimator__class_weight': [None, 'balanced']
    }

    grid_search = make_grid_search(
        estimator=pipeline,
        param_grid=param_grid,
        cv=10,
        scoring='balanced_accuracy',
        verbose=1,
        n_jobs=-1
    )

    print("\nIniciando b√∫squeda de hiperpar√°metros (GridSearchCV)...")
    grid_search.fit(X_train, y_train)

    # STEP 5: persist the fitted search (it predicts with its best estimator).
    best_model = grid_search
    save_estimator(best_model, path=model_path)
    # Report the path actually written instead of a hard-coded, mismatching one.
    print(f"\n‚úÖ Mejor modelo (GridSearch) guardado en '{model_path}'")

    # STEPS 6 & 7: compute and save metrics and confusion matrices.
    evaluate_and_save_metrics(best_model, X_train, y_train, X_test, y_test, output_path)

    print(f"\n‚úÖ M√©tricas y matrices de confusi√≥n guardadas en '{output_path}'")
    print("\nüéâ Entrenamiento y evaluaci√≥n completados.")

# Run the full workflow (load -> train -> save model -> save metrics) when this
# code is executed as the main module.
if __name__ == '__main__':
    train_and_evaluate_logistic_regression()

‚úÖ Carga y limpieza completadas. Tipos de datos asegurados.
‚úÖ Pipeline creado y robustecido contra TypeErrors.

Iniciando b√∫squeda de hiperpar√°metros (GridSearchCV)...
Fitting 10 folds for each of 96 candidates, totalling 960 fits

‚úÖ Mejor modelo (GridSearch) guardado en 'files/models/model.pkl.gz'

‚úÖ M√©tricas y matrices de confusi√≥n guardadas en '../files/output/metrics.json'

üéâ Entrenamiento y evaluaci√≥n completados. ¬°El modelo deber√≠a pasar el test!
