In [1]:
#Importar librerias
import pandas as pd
import os
import pickle
import json
import gzip
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  precision_score, balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

In [10]:
#Funciones
def load_data():
    # Define las columnas categóricas
    global categorical_features
    categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE'] 
    global numeric_features 
    numeric_features = ['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
        'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
        'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
    
    train = pd.read_csv(
        "../files/input/train_data.csv.zip",
        index_col=False,
        compression="zip",
    )
    test = pd.read_csv(
        "../files/input/test_data.csv.zip",
        index_col=False,
        compression="zip",
    )
    return train, test

def clear_data(df):
    #Renombrar
    df.rename(columns={"default payment next month": "default"}, inplace=True)
    #Eliminacion de columna
    df = df.drop("ID", axis=1)
    #Eliminacion elementos nulos 
    df.dropna(inplace=True)
    #Cambia valores de educacion mayores a 4
    df["EDUCATION"] = df["EDUCATION"].apply(lambda x: x if x<=4 else 4)
    return df

def make_train_test_split(df):
    #Division en etiquetas 
    y_df =  df["default"]
    #Division en caracteristicas de entrada
    x_df = df.drop("default", axis=1)
    return x_df, y_df


def cross_validation(pipeline, param_grid, x_train, y_train):
    #Evaluacion de hiperparametros
    model = GridSearchCV(
        estimator = pipeline,
        param_grid = param_grid,
        cv = 10,
        scoring="balanced_accuracy", #"f1_score"
    )
    #Aplicacion de GridSearchCV
    model.fit(x_train, y_train)
    return model

def save_grid_search_model(model):
    #Guardar mejor modelo
    if not os.path.exists("../files/models"):
        os.makedirs("../files/models")
    with gzip.open("../files/models/model.pkl.gz", "wb") as file:
        pickle.dump(model, file)

def eval_metrics(type_dataset, y_true, y_pred):
    #         | Pronóstico
    #         |  PP    PN
    #---------|------------
    #      P  |  TP    FN
    # Real    |
    #      N  |  FP    TN

    #(1/2)*(TP/P + TN/N)
    b_accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred,)
    #TP/(TP + FP)
    precision = precision_score(
        y_true=y_true,
        y_pred=y_pred, 
        labels=None, 
        pos_label=1,
        average="binary",)
    #TP/(TP + FN)
    recall = recall_score(
        y_true=y_true,
        y_pred=y_pred, 
        labels=None, 
        pos_label=1,
        average="binary",)
    #2*(precision*recall)/(precision + recall)
    f1 = f1_score(
        y_true=y_true,
        y_pred=y_pred,
        labels=None,
        pos_label=1,
        average="binary",
        sample_weight=None,
        zero_division="warn",)

    #Formar diccionario de metricas 
    dic_metrics = { "type": "metrics",
                   'dataset': type_dataset, 
                   'precision': precision , 
                   'balanced_accuracy': b_accuracy, 
                   'recall': recall, 
                   'f1_score': f1}
    print(dic_metrics)
    #Guardar metricas como archivo json
    if not os.path.exists("../files/output"):
        os.makedirs("../files/output")
    with open("../files/output/metrics.json", "a") as f:
        json.dump(dic_metrics, f)
        f.write("\n")

def eval_confusion_matrix(type_dataset, y_true, y_pred):
    #         | Pronóstico
    #         |  PP    PN
    #---------|------------
    #      P  |  TP    FN
    # Real    |
    #      N  |  FP    TN

    tn, fp, fn, tp = confusion_matrix(y_true=y_true, y_pred=y_pred,).ravel()

    #Formar diccionario de metricas 
    dic_confusion = {'type': 'cm_matrix', 'dataset': type_dataset, 
                   'true_0': {"predicted_0": int(tn), "predicte_1": int(fp)}, 
                   'true_1': {"predicted_0": int(fn), "predicted_1": int(tp)}}
    print(dic_confusion)
    #Guardar metricas como archivo json
    if not os.path.exists("../files/output"):
        os.makedirs("../files/output")
    with open("../files/output/metrics.json", "a") as f:
        json.dump(dic_confusion, f)
        f.write("\n")

#---------------------------FUCTION SET------------------------------------------
def dataset_manipulation():
    #Carga de datos
    train, test = load_data()
    #Limpieza de datos
    train = clear_data(train)
    test = clear_data(test)
    #Division en etiquetas y caracteristicas de entrada
    x_train, y_train = make_train_test_split(train)
    x_test, y_test = make_train_test_split(test)
    return x_train, y_train, x_test, y_test

def eval_model(model, x_train, y_train, x_test, y_test):
    if os.path.exists("../files/output/metrics.json"):
        os.remove("../files/output/metrics.json")
    # Calculo de métricas
    eval_metrics("train", y_train, y_pred=model.best_estimator_.predict(x_train))
    eval_metrics("test", y_test, y_pred=model.best_estimator_.predict(x_test))
    # Calculo matriz de confusión
    eval_confusion_matrix("train", y_train, y_pred=model.best_estimator_.predict(x_train))
    eval_confusion_matrix("test", y_test, y_pred=model.best_estimator_.predict(x_test))
    

In [20]:
#------------------------------MODEL------------------------------------------
def train_model(x_train, y_train):  
    #----------------------PIPELINE------------------------------
    # Crea el preprocesador
    preprocessor = ColumnTransformer(
        transformers=[
            #Ej. df[Sex]-->"1","2" por tanto la codificacion de esa columna sera un array de bit
            # "1"-->[1,0] "2"-->[0,1]
            ('one', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            #Escala los valores numericos en un rango de 0 y 1
            ("minmax", MinMaxScaler(feature_range=(0, 1)), numeric_features)
        ],
        remainder='passthrough'  # Deja las columnas numéricas igual
    )
    #Contruccion pipeline
    pipeline = make_pipeline(
    preprocessor,
    SelectKBest(k=15),
    LogisticRegression(),
    )
    #-------------------------PARAMETROS GRID-----------------------------
    #Definicion de hiperparametros a evualuar 
    paramters_grid = {
    'selectkbest__k': range(12, 18),  # Número de características a seleccionar
    'logisticregression__solver': ['lbfgs', 'saga'],
    'logisticregression__tol': [0.001],
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': [0.01, 1, 10],
    'logisticregression__class_weight': [{0: 1, 1: 1.24}]
    #'logisticregression__class_weight': [{0: 1, 1: 1.2},{0: 1, 1: 1.4},{0: 1, 1: 1.6},
    #                                     {0: 1, 1: 1.8},{0: 1, 1: 2}],
    }
    #----------------------TRAIN CROSS-----------------------
    model = cross_validation(pipeline=pipeline,
                             param_grid=paramters_grid,
                             x_train=x_train,
                             y_train=y_train
                             )
    return model


Ejecución del flujo principal para el dataset predefinido

In [13]:
#Carga y manipulacion de datos 
x_train, y_train, x_test, y_test = dataset_manipulation()

In [14]:
#Al verificar la cantidad de elementos de cada clase presentes en el dataset, 
#se determina que hay un desbalance entre las mismas, por tanto toca aplicar balanceo de la misma.
#Clase 0: 16,273 (~77%)
#Clase 1: 4,727 (~23%)
y_train.value_counts()

default
0    16273
1     4727
Name: count, dtype: int64

In [21]:
#Definicion y entrenamiento de modelo
model = train_model(x_train, y_train)  
#Informacion del mejor modelo y ademas definirlo
print(model.best_score_)
print(model.best_params_)

180 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "b:\Documentos\Universidad-Materias\Fundamentos_Analítica\LAB-10-prediccion-del-default-usando-logreg-juloperag\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "b:\Documentos\Universidad-Materias\Fundamentos_Analítica\LAB-10-prediccion-del-default-usando-logreg-juloperag\.venv\lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "b:\Documentos\Universidad-Materias\Fundamentos_Analítica\LAB-10-prediccion-del-d

0.6398079420098922
{'logisticregression__C': 10, 'logisticregression__class_weight': {0: 1, 1: 1.24}, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'saga', 'logisticregression__tol': 0.001, 'selectkbest__k': 17}




In [23]:
#Salvar mejor model
save_grid_search_model(model)

In [24]:
#Evaluacion del Modelo con diversas metricas
eval_model(model, x_train, y_train, x_test, y_test)

{'type': 'metrics', 'dataset': 'train', 'precision': 0.6887778282598819, 'balanced_accuracy': 0.6393082718312573, 'recall': 0.32071081023905224, 'f1_score': 0.4376443418013857}
{'type': 'metrics', 'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.654079064505956, 'recall': 0.34834992142482973, 'f1_score': 0.4651976215459951}
{'type': 'cm_matrix', 'dataset': 'train', 'true_0': {'predicted_0': 15588, 'predicte_1': 685}, 'true_1': {'predicted_0': 3211, 'predicted_1': 1516}}
{'type': 'cm_matrix', 'dataset': 'test', 'true_0': {'predicted_0': 6806, 'predicte_1': 285}, 'true_1': {'predicted_0': 1244, 'predicted_1': 665}}
