In [1]:
# Importamos librerías necesarias
import os
import pandas as pd
import gzip
import json
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score

In [2]:
import functions
df_train, df_test = functions.load_and_process_data()
x_train, y_train, x_test, y_test = functions.split_features_target(df_train, df_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((20953, 23), (20953,), (8979, 23), (8979,))

In [28]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA

def create_pipeline():
    categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"]
    numeric_cols = [c for c in x_train.columns if c not in categorical_cols]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(), categorical_cols),
            ("num", "passthrough", numeric_cols),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("pca", PCA()),
            ("scale", StandardScaler()),
            ("select", SelectKBest(score_func=f_classif)),
            ("mlp", MLPClassifier(
                max_iter=10000,
                random_state=42,
                early_stopping=True,
                n_iter_no_change=10,
                validation_fraction=0.1,
            )),
        ]
    )
    
    return pipeline

In [25]:
def make_grid_search(pipeline, x_train, y_train):
    # Búsqueda de hiperparámetros
    param_grid = {
        'pca__n_components': [None],
        'select__k': [10, 15, 20],
        'mlp__hidden_layer_sizes': [(50,), (100,), (150,), (100, 50)],
        'mlp__activation': ['relu', 'tanh'],
        'mlp__solver': ['adam', 'sgd'],
        'mlp__alpha': [0.0001, 0.001],
    }

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="balanced_accuracy",
        cv=10,
        n_jobs=9,
    )
    grid_search.fit(x_train, y_train)
    
    return grid_search

In [19]:
def save_estimator(estimator):
    models_path = "../files/models"
    os.makedirs(models_path, exist_ok=True)

    with gzip.open("../files/models/model.pkl.gz", "wb") as file:
        pickle.dump(estimator, file)

In [26]:
# Paso 6: Calculando métricas de evaluación
def calc_metrics(model, x_train, y_train, x_test, y_test):

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    metrics = [
        {
            'type': 'metrics',
            'dataset': 'train',
            'precision': precision_score(y_train, y_train_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
            'recall': recall_score(y_train, y_train_pred, zero_division=0),
            'f1_score': f1_score(y_train, y_train_pred, zero_division=0)
        },
        {
            'type': 'metrics',
            'dataset': 'test',
            'precision': precision_score(y_test, y_test_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
            'recall': recall_score(y_test, y_test_pred, zero_division=0),
            'f1_score': f1_score(y_test, y_test_pred, zero_division=0)
        },
        {
            'type': 'cm_matrix',
            'dataset': 'train',
            'true_0': {'predicted_0': int(cm_train[0, 0]), 'predicted_1': int(cm_train[0, 1])},
            'true_1': {'predicted_0': int(cm_train[1, 0]), 'predicted_1': int(cm_train[1, 1])}
        },
        {
            'type': 'cm_matrix',
            'dataset': 'test',
            'true_0': {'predicted_0': int(cm_test[0, 0]), 'predicted_1': int(cm_test[0, 1])},
            'true_1': {'predicted_0': int(cm_test[1, 0]), 'predicted_1': int(cm_test[1, 1])}
        }
    ]

    return metrics

In [21]:
# Paso 7: Guardamos las metricas
def save_metrics(metrics):
    metrics_path = "../files/output"
    os.makedirs(metrics_path, exist_ok=True)
    
    with open("../files/output/metrics.json", "w") as file:
        for metric in metrics:
            file.write(json.dumps(metric, ensure_ascii=False))
            file.write('\n')

In [29]:
pipeline = create_pipeline()
model = make_grid_search(pipeline, x_train, y_train)
save_estimator(model)
metrics = calc_metrics(model, x_train, y_train, x_test, y_test)
save_metrics(metrics)

print(model.best_estimator_)
print(model.best_params_)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE']),
                                                 ('num', 'passthrough',
                                                  ['LIMIT_BAL', 'AGE', 'PAY_0',
                                                   'PAY_2', 'PAY_3', 'PAY_4',
                                                   'PAY_5', 'PAY_6',
                                                   'BILL_AMT1', 'BILL_AMT2',
                                                   'BILL_AMT3', 'BILL_AMT4',
                                                   'BILL_AMT5', 'BILL_AMT6',
                                                   'PAY_AMT1', 'PAY_AMT2',
                                                   'PAY_AMT3', 'PAY_AMT4',
                                                   'PAY_AMT5', 'PAY_AMT6'])])),

In [23]:
# Traemos el modelo guardado
with gzip.open("../files/models/model.pkl.gz", "rb") as file:
    model = pickle.load(file)
metrics = calc_metrics(model, x_train, y_train, x_test, y_test)
save_metrics(metrics)

print(model.best_estimator_)
print(model.best_params_)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE']),
                                                 ('num', StandardScaler(),
                                                  ['LIMIT_BAL', 'AGE', 'PAY_0',
                                                   'PAY_2', 'PAY_3', 'PAY_4',
                                                   'PAY_5', 'PAY_6',
                                                   'BILL_AMT1', 'BILL_AMT2',
                                                   'BILL_AMT3', 'BILL_AMT4',
                                                   'BILL_AMT5', 'BILL_AMT6',
                                                   'PAY_AMT1', 'PAY_AMT2',
                                                   'PAY_AMT3', 'PAY_AMT4',
                                                   'PAY_AMT5', 'PAY_AMT6'])]