In [1]:
# Import libraries
import gzip
import json
import os
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
# Step 1: Data cleaning
def _clean_credit_data(raw):
    """Return a cleaned copy of a raw credit-default frame.

    Steps applied, in order:
      1. Rename "default payment next month" to "default".
      2. Drop the "ID" column.
      3. Remove rows whose MARRIAGE or EDUCATION value is 0.
      4. Cap EDUCATION at 4 so every value above 4 joins the "others" category.

    The input frame is not modified; the original row index is preserved.
    """
    cleaned = (
        raw
        .rename(columns={'default payment next month': 'default'})
        .drop(columns='ID')
    )
    valid_rows = (cleaned['MARRIAGE'] != 0) & (cleaned['EDUCATION'] != 0)
    # Explicit copy: assigning a column on a filtered slice would otherwise
    # raise pandas' SettingWithCopyWarning.
    cleaned = cleaned.loc[valid_rows].copy()
    # Vectorized equivalent of "4 if x > 4 else x".
    cleaned['EDUCATION'] = cleaned['EDUCATION'].clip(upper=4)
    return cleaned


# Load the datasets and apply the same cleaning to both splits
df_train = _clean_credit_data(
    pd.read_csv("../files/input/train_data.csv.zip", compression="zip")
)
df_test = _clean_credit_data(
    pd.read_csv("../files/input/test_data.csv.zip", compression="zip")
)

In [3]:
# Step 2: Separate the predictors (X) from the target column (y) for each split
X_train, y_train = df_train.drop(columns=['default']), df_train['default']
X_test, y_test = df_test.drop(columns=['default']), df_test['default']

In [4]:
# Step 3: Build the pipeline
# SEX, EDUCATION and MARRIAGE are one-hot encoded; every other predictor
# column is treated as numeric.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
numeric_features = [col for col in X_train.columns if col not in categorical_features]

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that was absent when the encoder
        # was fitted (e.g. missing from one CV training fold or from the
        # training set) is encoded as all zeros instead of raising at transform.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', MinMaxScaler(), numeric_features)
    ],
    remainder="passthrough"
)

# Full pipeline: preprocessing -> univariate feature selection -> classifier.
# The number of selected features (k) is tuned later by the grid search.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif)),
    ('classifier', LogisticRegression(random_state=42))
])

In [5]:
# Step 4: Hyperparameter optimization
# Keys follow the "<pipeline step>__<parameter>" convention.
param_grid = dict(
    feature_selection__k=range(1, 11),                # number of features kept
    classifier__C=[0.001, 0.01, 0.1, 1, 10, 100],
    classifier__penalty=['l1', 'l2'],                 # penalty type
    classifier__solver=['liblinear'],
    classifier__max_iter=[100, 200],
)

# 10-fold cross-validated search scored by balanced accuracy; the best
# configuration is refitted on the full training set.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='balanced_accuracy',
    n_jobs=-1,
    refit=True,
    cv=10,
    verbose=2,
)

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits


In [6]:
# Best model: keep the fitted search object itself; later cells call predict on it
best_model = grid_search
print("Mejores parámetros:", best_model.best_params_)

Mejores parámetros: {'classifier__C': 0.1, 'classifier__max_iter': 100, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'feature_selection__k': 1}


In [7]:
# Step 5: Save the model
# The fitted grid-search object is pickled into a gzip-compressed file.
# (pickle is imported with the other libraries in the notebook's import cell.)
os.makedirs('../files/models/', exist_ok=True)
with gzip.open('../files/models/model.pkl.gz', 'wb') as f:
    pickle.dump(grid_search, f)

In [8]:
# Step 6: Compute metrics
def calculate_metrics(y_true, y_pred, dataset_name):
    """Build the metrics record for one dataset split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        dataset_name: Label stored under the "dataset" key (e.g. "train").

    Returns:
        dict with keys "type" (always "metrics"), "dataset", "precision",
        "balanced_accuracy", "recall" and "f1_score". Precision, recall and
        F1 are computed with zero_division=0.
    """
    scores = {"type": "metrics", "dataset": dataset_name}
    scores["precision"] = precision_score(y_true, y_pred, zero_division=0)
    scores["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)
    scores["recall"] = recall_score(y_true, y_pred, zero_division=0)
    scores["f1_score"] = f1_score(y_true, y_pred, zero_division=0)
    return scores

# Predictions for the train split, then the test split
y_pred_train, y_pred_test = (
    best_model.predict(features) for features in (X_train, X_test)
)

# Metric records for each split
train_metrics = calculate_metrics(y_train, y_pred_train, "train")
test_metrics = calculate_metrics(y_test, y_pred_test, "test")

In [9]:
# Step 7: Compute confusion matrices
def calculate_confusion_matrix(y_true, y_pred, dataset_name):
    """Build the confusion-matrix record for one dataset split.

    Args:
        y_true: Ground-truth binary labels (0 / 1).
        y_pred: Predicted binary labels (0 / 1).
        dataset_name: Label stored under the "dataset" key (e.g. "train").

    Returns:
        dict with keys "type" (always "cm_matrix"), "dataset", and
        "true_0" / "true_1", each mapping "predicted_0" / "predicted_1"
        to a plain int count.
    """
    # Pin the label order so the matrix is always 2x2, even when one class
    # is absent from both y_true and y_pred; without it the matrix shrinks
    # to 1x1 and the indexing below raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        # int(...) converts NumPy integers so the record is JSON-serializable.
        "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }

train_cm, test_cm = (
    calculate_confusion_matrix(y_train, y_pred_train, "train"),
    calculate_confusion_matrix(y_test, y_pred_test, "test"),
)

# Save metrics and confusion matrices, one JSON object per line
all_metrics = [train_metrics, test_metrics, train_cm, test_cm]

os.makedirs('../files/output/', exist_ok=True)
with open('../files/output/metrics.json', 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in all_metrics)