In [38]:
def save_estimator_compressed(estimator, file_path="../files/models/model.pkl.gz"):
    """Pickle ``estimator`` and write it gzip-compressed to ``file_path``.

    Missing parent directories are created first. An existing file at
    ``file_path`` is overwritten.

    Parameters
    ----------
    estimator : object
        Any picklable object (in this notebook: a fitted scikit-learn object).
    file_path : str or os.PathLike
        Destination of the compressed pickle.
    """
    import os
    import gzip
    import pickle

    # A bare file name has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save the compressed model
    with gzip.open(file_path, "wb") as file:
        pickle.dump(estimator, file)

def load_estimator_compressed(file_path="../files/models/model.pkl.gz"):
    """Read back an object stored as a gzip-compressed pickle.

    This is a best-effort loader: any exception (missing file, corrupt
    archive, unpickling failure, ...) is reported with ``print`` and the
    function returns ``None`` instead of raising.

    Only load files you trust: unpickling can execute arbitrary code.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the compressed pickle.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when loading failed.
    """
    import os
    import gzip
    import pickle

    try:
        if os.path.exists(file_path):
            # Open the compressed file in binary read mode and unpickle it.
            with gzip.open(file_path, "rb") as stream:
                return pickle.load(stream)
        # The path does not exist: raise so the handler below reports it.
        raise FileNotFoundError(f"El archivo {file_path} no se encuentra.")
    except Exception as error:
        print(f"Ocurrió un error al cargar el modelo: {error}")

    return None

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [40]:
# Load the training data; the CSV is read straight from the zip archive
# (pandas infers the compression from the ".zip" extension by default).
dataset = pd.read_csv("../files/input/train_data.csv.zip")
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [41]:
# Clean the frame with one non-mutating chain so the cell can be re-run
# safely: the previous in-place version raised KeyError on a second run
# because "ID" had already been dropped.
dataset = (
    dataset
    # - Rename the column "default payment next month" to "default".
    .rename(columns={"default payment next month": "default"})
    # - Remove the "ID" column (a no-op if an earlier run already removed it).
    .drop(columns="ID", errors="ignore")
    # - Drop the records with unavailable information (rows containing NaN).
    .dropna()
    # - For EDUCATION, values > 4 indicate higher education levels; group
    #   them into the "others" category, which is code 4.
    .assign(EDUCATION=lambda frame: frame["EDUCATION"].clip(upper=4))
)

dataset.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [42]:
# Class balance of the target column.
dataset["default"].value_counts()

default
0    16273
1     4727
Name: count, dtype: int64

In [43]:
# List the remaining column names to check the rename and the "ID" removal.
dataset.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default'],
      dtype='object')

In [44]:
# Split the data into X_train, y_train, X_test, y_test.
# The target is the "default" column; every other column is a feature.
y = dataset["default"]
X = dataset.drop(columns=["default"])

# 80/20 partition with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Identify the categorical columns
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

# One-hot encode the categorical variables. handle_unknown="ignore" makes the
# encoder emit all zeros for a category that was absent from the data it was
# fitted on instead of raising; with rare category codes this can otherwise
# break a cross-validation fold or a later predict() call.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Preprocessor that applies the encoder to the categorical columns only
preprocessor = ColumnTransformer(
  transformers=[
    ('cat', categorical_transformer, categorical_features)
  ],
  remainder='passthrough'  # Leave the remaining columns untouched
)

# Pipeline: preprocessing followed by a random-forest classifier
pipeline = Pipeline(steps=[
  ('preprocessor', preprocessor),
  ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline on the test data (plain accuracy)
score = pipeline.score(X_test, y_test)
print(f'Accuracy: {score:.4f}')


Accuracy: 0.8162


In [61]:
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score

# Hyper-parameter grid (3 * 4 * 3 * 3 = 108 candidates, each fitted on 10 folds)
param_grid = {
  'classifier__n_estimators': [100, 200, 300],
  'classifier__max_depth': [5, 10, 15, 20],
  'classifier__min_samples_split': [2, 5, 10],
  'classifier__min_samples_leaf': [1, 2, 4]
}

# Grid-search object
grid_search = GridSearchCV(
  pipeline,  # Model to tune
  param_grid,  # Hyper-parameter grid
  cv=10,  # Number of cross-validation splits
  scoring=make_scorer(balanced_accuracy_score),  # Evaluation metric
  n_jobs=-1
)

# Fit the grid search on the training data
grid_search.fit(X_train, y_train)

# Best pipeline and its cross-validated balanced accuracy
estimator = grid_search.best_estimator_  # best estimator
best_score = grid_search.best_score_
print(f'Best Balanced Accuracy: {best_score:.4f}')

# Evaluate the best pipeline on the test data. This calls the pipeline's own
# score(), so it is labelled as plain accuracy rather than balanced accuracy.
best_score_test = estimator.score(X_test, y_test)
print(f'Best Accuracy: {best_score_test:.4f}')

# The model is deliberately NOT saved here. The later cell that calls
# load_estimator_compressed() compares this search against the previously
# stored model and then saves the better one; saving at this point would
# overwrite the stored model first and make that comparison meaningless.

KeyboardInterrupt: 

In [60]:
# Score the fitted search object on the held-out test set.
# NOTE(review): GridSearchCV.score is expected to use the `scoring` given at
# construction (balanced accuracy here), not plain accuracy — confirm in the docs.
grid_search.score(X_test, y_test)

np.float64(0.6594666928783754)

In [49]:
# Fetch the model persisted by an earlier run (None when it cannot be loaded).
best_estimator = load_estimator_compressed()

if best_estimator is not None:
    # Balanced accuracy on the test split: stored model first, then the current one.
    saved_balanced_accuracy, current_balanced_accuracy = (
        balanced_accuracy_score(y_true=y_test, y_pred=candidate.predict(X_test))
        for candidate in (best_estimator, grid_search)
    )

    # Keep the stored model only when it strictly beats the current search.
    if saved_balanced_accuracy > current_balanced_accuracy:
        grid_search = best_estimator

# 5. Save the model
save_estimator_compressed(grid_search)

In [53]:
# Reload the persisted model and score it on the test set as a sanity check.
# NOTE(review): load_estimator_compressed() returns None on any load failure,
# in which case the .score call below raises AttributeError.
best_estimator = load_estimator_compressed()
best_estimator.score(X_test, y_test)

np.float64(0.6539361762482336)

In [55]:
def calculate_and_save_metrics(
    model,
    x_train,
    x_test,
    y_train,
    y_test,
    output_path="../files/output/metrics.json",
):
    """Compute classification metrics on train and test data and write them as JSON Lines.

    Two records are written, one for ``'train'`` and one for ``'test'``, each
    holding precision, balanced accuracy, recall and F1 of ``model``'s
    predictions. The output file is opened in ``"w"`` mode, so any previous
    content is replaced.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    x_train, x_test : array-like
        Feature sets passed to ``model.predict``.
    y_train, y_test : array-like
        True labels matching ``x_train`` / ``x_test``.
    output_path : str or os.PathLike, optional
        File to write; its parent directory is created when missing.
        Defaults to the location used previously.
    """
    from sklearn.metrics import (
        precision_score,
        recall_score,
        f1_score,
        balanced_accuracy_score,
    )
    import json
    import os

    def _metrics_record(dataset_name, y_true, y_pred):
        # Build one JSON-serialisable record; float() keeps the values plain
        # Python floats regardless of the scalar type the metric returns.
        return {
            'type': 'metrics',
            'dataset': dataset_name,
            'precision': float(precision_score(y_true, y_pred, zero_division=0)),
            'balanced_accuracy': float(balanced_accuracy_score(y_true, y_pred)),
            'recall': float(recall_score(y_true, y_pred, zero_division=0)),
            'f1_score': float(f1_score(y_true, y_pred, zero_division=0)),
        }

    metrics = [
        _metrics_record('train', y_train, model.predict(x_train)),
        _metrics_record('test', y_test, model.predict(x_test)),
    ]

    # Create the target directory only when the path actually has one
    # (os.makedirs("") raises for a bare file name).
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w") as f:
        for metric in metrics:
            f.write(json.dumps(metric) + '\n')

def calculate_and_save_confusion_matrices(
    model,
    x_train,
    x_test,
    y_train,
    y_test,
    output_path="../files/output/metrics.json",
):
    """Append train and test confusion matrices to a JSON Lines file.

    Two records of type ``'cm_matrix'`` are appended (file mode ``"a"``), one
    for ``'train'`` and one for ``'test'``. Each record stores the four cells
    of the binary confusion matrix under ``true_0`` / ``true_1`` and
    ``predicted_0`` / ``predicted_1``. Because the file is appended to,
    calling this repeatedly adds duplicate records.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    x_train, x_test : array-like
        Feature sets passed to ``model.predict``.
    y_train, y_test : array-like
        True labels (expected to be the binary codes 0 and 1).
    output_path : str or os.PathLike, optional
        File to append to; its parent directory is created when missing.
        Defaults to the location used previously.
    """
    import json
    import os
    from sklearn.metrics import confusion_matrix

    def _matrix_record(dataset_name, y_true, y_pred):
        # labels=[0, 1] pins the matrix to 2x2 in a fixed order. Without it a
        # split where only one class occurs yields a 1x1 matrix and the
        # cm[0, 1] lookups below raise IndexError.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        return {
            'type': 'cm_matrix',
            'dataset': dataset_name,
            'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
            'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])},
        }

    matrices = [
        _matrix_record('train', y_train, model.predict(x_train)),
        _matrix_record('test', y_test, model.predict(x_test)),
    ]

    # Do not rely on another function having created the directory already.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "a") as f:
        for matrix in matrices:
            f.write(json.dumps(matrix) + '\n')

In [56]:
# Report on `grid_search`, the object that save_estimator_compressed() persisted
# above. `estimator` is only the best pipeline of the latest search and can
# differ from the stored model when the previously saved one was kept, which
# would leave metrics.json describing a model other than the one on disk.
calculate_and_save_metrics(grid_search, X_train, X_test, y_train, y_test)

# 6. Confusion matrices
calculate_and_save_confusion_matrices(grid_search, X_train, X_test, y_train, y_test)

In [57]:
import pandas as pd
# Read the JSON file that holds one object per line (JSON Lines) into a DataFrame
metrica = pd.read_json("../files/output/metrics.json", lines=True)
metrica

Unnamed: 0,type,dataset,precision,balanced_accuracy,recall,f1_score,true_0,true_1
0,metrics,train,1.0,0.998685,0.997369,0.998683,,
1,metrics,test,0.651163,0.653936,0.362851,0.466019,,
2,cm_matrix,train,,,,,"{'predicted_0': 12999, 'predicted_1': 0}","{'predicted_0': 10, 'predicted_1': 3791}"
3,cm_matrix,test,,,,,"{'predicted_0': 3094, 'predicted_1': 180}","{'predicted_0': 590, 'predicted_1': 336}"
