In [2]:
#Imports

import os
import json
import gzip
import pickle 
import numpy as np
import pandas as pd    
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score

In [3]:
#Paso 1: Cargar y preprocesar los datos

def load_train_test_data(
    train_path="../files/input/train_data.csv.zip",
    test_path="../files/input/test_data.csv.zip",
):
    """Read the train and test CSV files into DataFrames.

    Parameters
    ----------
    train_path : str or path-like, optional
        Location of the training CSV. Defaults to the path originally
        hard-coded in this notebook.
    test_path : str or path-like, optional
        Location of the test CSV. Defaults to the path originally
        hard-coded in this notebook.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` exactly as read from disk.
    """
    # index_col=False tells pandas not to use the first column as the index.
    train_df = pd.read_csv(train_path, index_col=False)
    test_df = pd.read_csv(test_path, index_col=False)
    return train_df, test_df


def _clean_frame(df):
    """Return a cleaned copy of one raw frame; the input is left untouched."""
    cleaned = (
        df.drop(columns=["ID"])
        .rename(columns={"default payment next month": "default"})
        .dropna()
    )
    # Collapse every EDUCATION code above 4 into 4.
    cleaned = cleaned.assign(EDUCATION=cleaned["EDUCATION"].clip(upper=4))
    # Discard rows whose MARRIAGE or EDUCATION code is 0 (treated as "not available").
    return cleaned.query("MARRIAGE != 0 & EDUCATION != 0")


def preprocess_data(train_df, test_df):
    """Apply the same cleaning steps to the train and test frames.

    For each frame, in order:
      1. drop the ``ID`` column;
      2. rename ``default payment next month`` to ``default``;
      3. drop rows containing any NaN;
      4. cap ``EDUCATION`` values greater than 4 at 4;
      5. remove rows where ``MARRIAGE`` or ``EDUCATION`` equals 0.

    Unlike the previous implementation, the input frames are NOT modified:
    every step returns a new DataFrame, so the caller's raw data stays intact.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Raw frames containing at least ``ID``, ``EDUCATION`` and ``MARRIAGE``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The cleaned ``(train_df, test_df)``. Row indices are not reset.

    Raises
    ------
    KeyError
        If a frame has no ``ID`` column (raised by ``DataFrame.drop``).
    """
    return _clean_frame(train_df), _clean_frame(test_df)


# Load both raw splits and clean them in a single step.
train, test = preprocess_data(*load_train_test_data())

In [4]:
#Paso 2: Dividir los datos en conjuntos de entrenamiento y prueba

def make_train_test_split(train_df, test_df):
    """Separate features from the ``default`` target for both frames.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that each contain a ``default`` column.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where each ``X`` is the frame
        without the ``default`` column and each ``y`` is that column.
    """
    target = "default"
    features_train, features_test = (
        frame.drop(columns=[target]) for frame in (train_df, test_df)
    )
    labels_train, labels_test = train_df[target], test_df[target]
    return features_train, labels_train, features_test, labels_test

X_train, y_train, X_test, y_test = make_train_test_split(train, test)

# Sanity check: print the shapes of the feature matrix and target of each split.
print(
    "Dimensiones del conjunto de entrenamiento:",
    *(part.shape for part in (X_train, y_train)),
)
print(
    "Dimensiones del conjunto de prueba:",
    *(part.shape for part in (X_test, y_test)),
)

Dimensiones del conjunto de entrenamiento: (20953, 23) (20953,)
Dimensiones del conjunto de prueba: (8979, 23) (8979,)


In [5]:
#Paso 3: Construir el pipeline de preprocesamiento y modelado

def make_pipeline(estimator, feature_names=None):
    """Build the preprocessing + feature-selection + estimator pipeline.

    Steps, in order:
      * ``transformer``: one-hot encodes ``MARRIAGE``, ``EDUCATION`` and
        ``SEX`` and min-max scales every other listed column to [0, 1];
        columns not listed are passed through unchanged.
      * ``selectkbest``: ``SelectKBest`` with the ``f_classif`` score and
        ``k="all"`` (``k`` can be overridden later, e.g. by a grid search).
      * ``estimator``: the estimator passed in.

    Parameters
    ----------
    estimator : estimator object
        Final step of the pipeline.
    feature_names : iterable of str, optional
        Names of all input feature columns. Every name that is not one of the
        categorical columns is treated as numerical. When omitted, the
        notebook-level ``X_train.columns`` is used, which is what this
        function previously did implicitly.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    # Categorical columns are fixed; the rest are derived from feature_names.
    categorical_cols = ["MARRIAGE", "EDUCATION", "SEX"]
    if feature_names is None:
        # Backward-compatible fallback to the notebook global. Pass
        # feature_names explicitly to avoid depending on hidden state.
        feature_names = X_train.columns
    numerical_cols = [col for col in feature_names if col not in categorical_cols]

    transformer = ColumnTransformer(
        transformers=[
            # handle_unknown="ignore": categories unseen during fit do not raise at transform time.
            ("cat", OneHotEncoder(dtype="int", handle_unknown="ignore"), categorical_cols),
            ("num", MinMaxScaler(feature_range=(0, 1)), numerical_cols),
        ],
        remainder="passthrough",
    )

    selectkbest_feature = SelectKBest(score_func=f_classif, k="all")

    pipeline = Pipeline(
        steps=[
            ("transformer", transformer),
            ("selectkbest", selectkbest_feature),
            ("estimator", estimator),
        ],
        verbose=False,
    )

    return pipeline


# Logistic regression with a fixed seed and a generous iteration budget,
# wrapped in the preprocessing pipeline.
lr = LogisticRegression(max_iter=4000, random_state=42)
estimator_pipeline = make_pipeline(estimator=lr)

In [6]:
# Paso 4: Buscar hiperparámetros óptimos usando GridSearchCV

def perform_grid_search(estimator_pipeline, X_train, y_train):
    """Tune the pipeline with an exhaustive, cross-validated grid search.

    The grid spans the number of selected features (``selectkbest__k``) and
    the estimator's penalty, solver, ``C`` and class weighting. Each candidate
    is scored with accuracy and balanced accuracy over 10 shuffled stratified
    folds (seed 42); the candidate with the best mean accuracy is refitted on
    the full training data.

    Parameters
    ----------
    estimator_pipeline : sklearn.pipeline.Pipeline
        Pipeline exposing ``selectkbest`` and ``estimator`` steps.
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.
    """
    search_space = {
        "selectkbest__k": [1, 8, 12],
        "estimator__penalty": ["l1", "l2"],
        "estimator__solver": ["liblinear"],
        "estimator__C": [1, 10],
        "estimator__class_weight": [None, "balanced"],
    }

    metric_scorers = {
        "accuracy": "accuracy",
        "balanced_accuracy": make_scorer(balanced_accuracy_score),
    }

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    search = GridSearchCV(
        estimator_pipeline,
        search_space,
        scoring=metric_scorers,
        # Several scorers are tracked, so name the one that picks the refitted model.
        refit="accuracy",
        cv=folds,
        n_jobs=-1,
        verbose=0,
    )

    # GridSearchCV.fit returns the fitted search object itself.
    return search.fit(X_train, y_train)

# Run the hyperparameter search on the training split.
model = perform_grid_search(
    estimator_pipeline=estimator_pipeline,
    X_train=X_train,
    y_train=y_train,
)

In [7]:
# Paso 5: Guardar el modelo

# Make sure the target directory exists before writing.
os.makedirs("../files/models/", exist_ok=True)

# Persist the whole fitted GridSearchCV object as a gzip-compressed pickle.
with gzip.open("../files/models/model.pkl.gz", "wb") as model_file:
    pickle.dump(model, model_file)

In [8]:
# Paso 6 y 7: Calcular y guardar métricas de evaluación

def _metrics_record(dataset, y_true, y_pred):
    """Build the 'metrics' record (precision, balanced accuracy, recall, F1) for one split."""
    return {
        'type': 'metrics',
        'dataset': dataset,
        'precision': float(precision_score(y_true, y_pred, zero_division=0)),
        'balanced_accuracy': float(balanced_accuracy_score(y_true, y_pred)),
        'recall': float(recall_score(y_true, y_pred, zero_division=0)),
        'f1_score': float(f1_score(y_true, y_pred, zero_division=0)),
    }


def _confusion_record(dataset, y_true, y_pred):
    """Build the 'cm_matrix' record (2x2 confusion matrix counts) for one split."""
    # Pin the labels so the matrix is always 2x2, even when a split or the
    # predictions contain a single class; otherwise cm[0][1] raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {'predicted_0': int(cm[0][0]), 'predicted_1': int(cm[0][1])},
        'true_1': {'predicted_0': int(cm[1][0]), 'predicted_1': int(cm[1][1])},
    }


def save_evaluation_metrics(
    model,
    X_train,
    y_train,
    X_test,
    y_test,
    output_path="../files/output/metrics.json",
):
    """Evaluate ``model`` on both splits and write the results as JSON lines.

    Four records are written, one JSON object per line, in this order:
    train metrics, test metrics, train confusion matrix, test confusion
    matrix. Metrics use ``zero_division=0`` so undefined precision/recall/F1
    values are reported as 0 rather than triggering a warning.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict``.
    X_train, X_test : array-like
        Feature matrices passed to ``model.predict``.
    y_train, y_test : array-like
        True binary labels, expected to be encoded as 0/1.
    output_path : str, optional
        Destination file. Its parent directory is created if missing.
        Defaults to the path originally hard-coded in this notebook.

    Returns
    -------
    None
    """
    # Predict once per split and reuse the predictions for both record types.
    splits = (
        ("train", y_train, model.predict(X_train)),
        ("test", y_test, model.predict(X_test)),
    )

    results = [_metrics_record(*split) for split in splits]
    results += [_confusion_record(*split) for split in splits]

    # Create the output directory if needed (a bare filename has no parent to create).
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # One JSON document per line.
    with open(output_path, "w", encoding="utf-8") as file:
        for record in results:
            json.dump(record, file)
            file.write("\n")

# Compute and persist the evaluation metrics for both splits.
save_evaluation_metrics(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)