In [9]:
import pandas as pd

# Read both zipped CSV splits. index_col=False tells pandas not to use the
# first column of the file as the DataFrame index.
train_data = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)
test_data = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)

In [10]:
#paso 1

def _clean_credit_data(raw):
    """Return a cleaned copy of one raw dataset split.

    Steps, applied in this order:
      1. Rename 'default payment next month' to 'default'.
      2. Drop rows whose MARRIAGE or EDUCATION code is 0.
      3. Collapse every EDUCATION code above 4 into 4.
      4. Drop the 'ID' column.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing at least the columns MARRIAGE, EDUCATION and ID.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw`` itself is not modified.
    """
    renamed = raw.rename(columns={'default payment next month': 'default'})
    keep_rows = (renamed["MARRIAGE"] != 0) & (renamed["EDUCATION"] != 0)
    # Explicit copy: assigning a column on a filtered frame otherwise raises
    # pandas' SettingWithCopyWarning.
    cleaned = renamed.loc[keep_rows].copy()
    # Vectorised equivalent of `4 if x > 4 else x`.
    cleaned["EDUCATION"] = cleaned["EDUCATION"].clip(upper=4)
    return cleaned.drop(columns=['ID'])


# Both splits go through exactly the same cleaning so they stay comparable.
test_data = _clean_credit_data(test_data)
train_data = _clean_credit_data(train_data)


In [11]:
#paso 2
# Separate the predictors from the target column for each split.
target_column = "default"

x_train, y_train = train_data.drop(columns=target_column), train_data[target_column]
x_test, y_test = test_data.drop(columns=target_column), test_data[target_column]

In [12]:
#paso 3
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.feature_selection import f_classif,SelectKBest

# Columns one-hot encoded by the preprocessor.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
# Columns rescaled with MinMaxScaler.
numerical_features = ["LIMIT_BAL", "AGE", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3"]

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": during cross-validation a category level can
        # be absent from a training fold; with the default ("error") transform()
        # would raise when that level shows up in the validation fold.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", MinMaxScaler(), numerical_features),
    ],
    # NOTE(review): every column not listed above is forwarded unchanged, i.e.
    # unscaled, to the selector and the classifier - confirm this is intended.
    remainder="passthrough",
)

# Preprocess -> keep the k best features by ANOVA F-score -> logistic regression.
# k is left at its default here; the grid search in the next step sets it.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("feature_selector", SelectKBest(score_func=f_classif)),
        ("classifier", LogisticRegression(random_state=42)),
    ]
)


In [13]:
#paso 4
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score

# Hyperparameter search space. Keys follow the "<pipeline step>__<parameter>"
# naming so GridSearchCV can route each value to the right pipeline step.
param_grid = dict(
    feature_selector__k=range(1, 10),
    classifier__C=[0.001, 0.01, 0.1, 1, 10, 100],
    # Type of regularisation to apply.
    classifier__penalty=['l1', 'l2'],
    # Optimisation algorithm used to fit the model.
    classifier__solver=['liblinear'],
    # Maximum number of iterations the optimiser may run.
    classifier__max_iter=[100, 200],
)

# Candidates are ranked by balanced accuracy.
balanced_accuracy_scorer = make_scorer(balanced_accuracy_score)

# Cross-validated exhaustive search over the grid.
model = GridSearchCV(
    estimator=pipeline,               # pipeline defined in the previous step
    param_grid=param_grid,            # hyperparameter space
    scoring=balanced_accuracy_scorer,
    cv=10,                            # 10-fold cross-validation
    n_jobs=-1,                        # use every available core
    refit=True,                       # refit on the full training set with the best parameters
    verbose=1,                        # report progress
)

# Fit the search on the training data.
model.fit(x_train, y_train)


Fitting 10 folds for each of 216 candidates, totalling 2160 fits


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [14]:
#paso 5
import pickle
import os
import gzip

# Single source of truth for where the fitted search object is stored; the
# directory to create and the file to write are both derived from it.
model_path = "../files/models/model.pkl.gz"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the fitted model as a gzip-compressed pickle.
with gzip.open(model_path, 'wb') as f:
    pickle.dump(model, f)

In [15]:
#paso 6
import os
import json
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score

def calculate_metrics(y_true, y_pred, dataset_name):
    """Build the metrics record for one dataset split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    dataset_name : str
        Stored verbatim under the "dataset" key (the notebook passes
        'train' or 'test').

    Returns
    -------
    dict
        Keys, in order: "type" (always "metrics"), "dataset", "precision",
        "balanced_accuracy", "recall", "f1_score".
    """
    # Precision, recall and F1 are computed with average='binary'.
    precision = precision_score(y_true, y_pred, average='binary')
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred, average='binary')
    f1 = f1_score(y_true, y_pred, average='binary')

    return dict(
        type="metrics",
        dataset=dataset_name,
        precision=precision,
        balanced_accuracy=balanced_accuracy,
        recall=recall,
        f1_score=f1,
    )


# Destination of the metrics file; create its directory up front.
output_path = "../files/output/metrics.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Predict on both splits with the fitted search object.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

train_metrics = calculate_metrics(y_train, y_train_pred, 'train')
test_metrics = calculate_metrics(y_test, y_test_pred, 'test')

# Mode 'w' starts the file from scratch: one JSON object per line, train first.
with open(output_path, 'w') as metrics_file:
    for record in (train_metrics, test_metrics):
        metrics_file.write(json.dumps(record) + '\n')

print(f"Métricas guardadas en: {output_path}")

Métricas guardadas en: ../files/output/metrics.json


In [16]:
#paso 7 
from sklearn.metrics import confusion_matrix

def calculate_and_save_confusion_matrices(model, X_train, X_test, y_train, y_test,
                                          output_path='../files/output/metrics.json'):
    """Append the train and test confusion matrices to the metrics file.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict``.
    X_train, X_test : feature sets passed to ``model.predict``.
    y_train, y_test : true labels for the corresponding feature sets.
    output_path : str, optional
        File the records are appended to, one JSON object per line. The
        directory must already exist. Defaults to the path used by the rest
        of the notebook.

    Returns
    -------
    None
        The only effect is the two lines appended to ``output_path``.
    """

    def format_confusion_matrix(cm, dataset_type):
        """Convert a 2x2 matrix into the nested record written to the file."""
        # int(...) turns the numpy integers into plain ints so json can dump them.
        return {
            'type': 'cm_matrix',
            'dataset': dataset_type,
            'true_0': {
                'predicted_0': int(cm[0, 0]),
                'predicted_1': int(cm[0, 1])
            },
            'true_1': {
                'predicted_0': int(cm[1, 0]),
                'predicted_1': int(cm[1, 1])
            }
        }

    # labels=[0, 1] guarantees a 2x2 matrix even when a split (or the
    # predictions for it) contains a single class; without it the matrix can
    # be 1x1 and the cm[0, 1] lookup above raises IndexError.
    class_labels = [0, 1]
    cm_train = confusion_matrix(y_train, model.predict(X_train), labels=class_labels)
    cm_test = confusion_matrix(y_test, model.predict(X_test), labels=class_labels)

    records = [
        format_confusion_matrix(cm_train, 'train'),
        format_confusion_matrix(cm_test, 'test')
    ]

    # Mode 'a': the records are added after whatever the file already holds.
    with open(output_path, 'a') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')


def main(model, X_train, X_test, y_train, y_test):
    """Append the confusion matrices for both splits to the metrics file.

    The scalar metrics are written by the earlier "paso 6" cell. This function
    previously recomputed them from the notebook globals ``y_train_pred`` and
    ``y_test_pred`` (not parameters of this function) and discarded the result;
    that dead code raised NameError whenever those globals were missing, so it
    has been removed.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict``.
    X_train, X_test : feature sets passed to ``model.predict``.
    y_train, y_test : true labels for the corresponding feature sets.
    """
    import os

    # The output directory must exist before the records are appended.
    os.makedirs('../files/output', exist_ok=True)

    calculate_and_save_confusion_matrices(model, X_train, X_test, y_train, y_test)

main(model, x_train, x_test, y_train, y_test)
