In [5]:

import pandas as pd
import numpy as np
import os 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, f1_score, balanced_accuracy_score,
    confusion_matrix
)

In [8]:
import gzip
import json
import os
import pickle

import pandas as pd  # type: ignore

In [9]:
MODEL_FILENAME = "files/models/model.pkl.gz"
MODEL_COMPONENTS = [
    "OneHotEncoder",
    "RandomForestClassifier",
]
SCORES = [
    0.785,
    0.673,
]
METRICS = [
    {
        "type": "metrics",
        "dataset": "train",
        "precision": 0.944,
        "balanced_accuracy": 0.785,
        "recall": 0.580,
        "f1_score": 0.719,
    },
    {
        "type": "metrics",
        "dataset": "test",
        "precision": 0.650,
        "balanced_accuracy": 0.673,
        "recall": 0.401,
        "f1_score": 0.498,
    },
    {
        "type": "cm_matrix",
        "dataset": "train",
        "true_0": {"predicted_0": 16060, "predicted_1": None},
        "true_1": {"predicted_0": None, "predicted_1": 2740},
    },
    {
        "type": "cm_matrix",
        "dataset": "test",
        "true_0": {"predicted_0": 6670, "predicted_1": None},
        "true_1": {"predicted_0": None, "predicted_1": 760},
    },
]


In [6]:
def _load_model():
    """Generic test to load a model"""
    assert os.path.exists(MODEL_FILENAME)
    with gzip.open(MODEL_FILENAME, "rb") as file:
        model = pickle.load(file)
    assert model is not None
    return model

In [11]:
model = _load_model()

AssertionError: 

In [9]:
import os
print("Working directory:", os.getcwd())

Working directory: c:\Users\valen\Documents\Maestria\Analitica Predictiva\LAB-01-prediccion-del-default-usando-rf-vatamayog\homework


In [10]:
os.chdir("..")
os.getcwd()

'c:\\Users\\valen\\Documents\\Maestria\\Analitica Predictiva\\LAB-01-prediccion-del-default-usando-rf-vatamayog'

 # Paso 1.
 Realice la limpieza de los datasets:
 - Renombre la columna "default payment next month" a "default".
 - Remueva la columna "ID".
 - Elimine los registros con informacion no disponible.
 - Para la columna EDUCATION, valores > 4 indican niveles superiores

In [11]:
# ============================
# Paso 1. Cargar y limpiar datos
# ============================


def load_data(Pathfile1, Pathfile2=None):
    import pandas as pd

    if Pathfile2 is not None:
        train_df = pd.read_csv(Pathfile1, compression="zip")
        test_df = pd.read_csv(Pathfile2, compression="zip")
        return train_df, test_df
    else:
        df = pd.read_csv(Pathfile1, compression="zip")
        return df

load_data("files/input/train_data.csv.zip","files/input/test_data.csv.zip")

(          ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  \
 0      10748     310000    1          3         1   32      0      0      0   
 1      12574      10000    2          3         1   49     -1     -1     -2   
 2      29677      50000    1          2         1   28     -1     -1     -1   
 3       8857      80000    2          3         1   52      2      2      3   
 4      21099     270000    1          1         2   34      1      2      0   
 ...      ...        ...  ...        ...       ...  ...    ...    ...    ...   
 20995  27956     140000    2          2         1   27      2     -1     -1   
 20996  27108     130000    1          2         2   41      0      0      0   
 20997     26      50000    1          3         2   23      0      0      0   
 20998  14778      90000    2          3         2   25      0      0      0   
 20999  20634     120000    1          2         2   31      0      0      0   
 
        PAY_4  ...  BILL_AMT4  BILL_AM

In [12]:
train_df, test_df = load_data("files/input/train_data.csv.zip","files/input/test_data.csv.zip")
train_df.head()
test_df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0
2,11,200000,2,3,2,34,0,0,2,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,15,250000,1,1,2,29,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,16,50000,2,3,3,23,1,2,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0


In [13]:
import pandas as pd
import zipfile
import numpy as np


def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Limpieza y transformación del dataset 

    Args:
        df (pd.DataFrame): DataFrame original.

    Returns:
        pd.DataFrame: DataFrame limpio.
    """

    # Renombrar variable objetivo
    if "default payment next month" in df.columns:
        df.rename(columns={"default payment next month": "default"}, inplace=True)

    # Eliminar columna ID si existe
    df.drop(columns=["ID"], inplace=True, errors="ignore")

    # Eliminar registros con info no disponible
    df["EDUCATION"] = df["EDUCATION"].replace(0, np.nan)

    # Mapear EDUCATION a categorías
    def map_education(x):
        if x == 1:
            return "graduate_school"
        elif x == 2:
            return "university"
        elif x == 3:
            return "high_school"
        else:
            return "others"

    df["EDUCATION"] = df["EDUCATION"].apply(map_education)

    return df


# Limpiar datasets

train_df = clean_dataset(train_df)
test_df = clean_dataset(test_df)



# Paso 2.
Divida los datasets en x_train, y_train, x_test, y_test.

In [14]:
def split_features_target(df: pd.DataFrame, target_col):
    """
    Separa las variables explicativas y la variable objetivo 'default'.

    Args:
        df (pd.DataFrame): DataFrame limpio.

    Returns:
        x (pd.DataFrame): Variables explicativas.
        y (pd.Series): Variable objetivo.
    """
    x = df.drop(columns=[target_col])
    y = df[target_col]
    return x, y


In [15]:
x_train, y_train = split_features_target(train_df, "default")
x_test, y_test = split_features_target(test_df, "default")


print("Tamaños:")
print("x_train:", x_train.shape)
print("y_train:", y_train.shape)
print("x_test:", x_test.shape)
print("y_test:", y_test.shape)


Tamaños:
x_train: (21000, 23)
y_train: (21000,)
x_test: (9000, 23)
y_test: (9000,)


# Paso 3.
 Cree un pipeline para el modelo de clasificación. Este pipeline debe
 contener las siguientes capas:
 - Transforma las variables categoricas usando el método
   one-hot-encoding.
 - Ajusta un modelo de bosques aleatorios (rando forest).

In [8]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 23 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   LIMIT_BAL  21000 non-null  int64 
 1   SEX        21000 non-null  int64 
 2   EDUCATION  21000 non-null  object
 3   MARRIAGE   21000 non-null  int64 
 4   AGE        21000 non-null  int64 
 5   PAY_0      21000 non-null  int64 
 6   PAY_2      21000 non-null  int64 
 7   PAY_3      21000 non-null  int64 
 8   PAY_4      21000 non-null  int64 
 9   PAY_5      21000 non-null  int64 
 10  PAY_6      21000 non-null  int64 
 11  BILL_AMT1  21000 non-null  int64 
 12  BILL_AMT2  21000 non-null  int64 
 13  BILL_AMT3  21000 non-null  int64 
 14  BILL_AMT4  21000 non-null  int64 
 15  BILL_AMT5  21000 non-null  int64 
 16  BILL_AMT6  21000 non-null  int64 
 17  PAY_AMT1   21000 non-null  int64 
 18  PAY_AMT2   21000 non-null  int64 
 19  PAY_AMT3   21000 non-null  int64 
 20  PAY_AMT4   21000 non-null  i

In [9]:
# Verificar nulos

print("Nulos en x_train:\n", x_train.isnull().sum())
print("Nulos en x_test:\n", x_test.isnull().sum())

Nulos en x_train:
 LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
dtype: int64
Nulos en x_test:
 LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
dtype: int64


In [16]:
def make_pipeline(list_categorical, estimator):

    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    transformer = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", dtype="int"), list_categorical)
        ],
        remainder="passthrough"  # Dejar las columnas no categoricas sin cambios
    )

    pipeline = Pipeline(
        steps=[
            ("transformer", transformer),
            ("classifier", estimator)
        ],  
        verbose=False
    )

    return pipeline

#  Paso 4.
- Optimice los hiperparametros del pipeline usando validación cruzada.
- se 10 splits para la validación cruzada. Use la función de precision
- balanceada para medir la precisión del modelo.


In [20]:
def make_grid_search(estimator, param_grid, cv, score):

    from sklearn.model_selection import GridSearchCV

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring=score,
        n_jobs=-1,
        verbose=1
    )

    return grid_search

In [22]:
def save_estimator(estimator, output_path):

    import pickle

    with open(output_path, "wb") as file:
        pickle.dump(estimator, file)
    print(f"Modelo guardado en: {output_path}")


def load_estimator(output_path):

    import os
    import pickle

    if not os.path.exists(output_path):
        return None
    with open(output_path, "rb") as file:
        estimator = pickle.load(file)

    return estimator

In [36]:
list_categorical = ["SEX", "EDUCATION", "MARRIAGE"]
pipeline = make_pipeline(list_categorical, RandomForestClassifier(random_state=42))

# Definir los hiperparámetros a optimizar
param_grid = {
    #"classifier__n_estimators": [50, 100, 200],
    #"classifier__max_depth": [None, 10, 20, 30],
    #"classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4]
}


In [41]:
def train_logistic_regression(pipeline, file_output): 

    from sklearn.linear_model import LogisticRegression


    estimator = make_grid_search(pipeline, param_grid, cv=10, score="balanced_accuracy")

    estimator.fit(x_train, y_train)

    # Mostrar los mejores hiperparámetros 
    print("Mejores hiperparámetros:", estimator.best_params_)
    print("Mejor puntaje de validación cruzada:", estimator.best_score_)

    save_estimator(estimator, file_output)
    return estimator

In [45]:
model = train_logistic_regression(pipeline, "files/models/model.pkl.gz")

Fitting 10 folds for each of 3 candidates, totalling 30 fits
Mejores hiperparámetros: {'classifier__min_samples_leaf': 2}
Mejor puntaje de validación cruzada: 0.6568097051442583
Modelo guardado en: files/models/model.pkl.gz


In [None]:
model1 = load_estimator(output_path = "files/models/model.pkl.gz")

In [46]:
current_components = [str(model.estimator[i]) for i in range(len(model.estimator))]
current_components

["ColumnTransformer(remainder='passthrough',\n                  transformers=[('cat',\n                                 OneHotEncoder(dtype='int',\n                                               handle_unknown='ignore'),\n                                 ['SEX', 'EDUCATION', 'MARRIAGE'])])",
 'RandomForestClassifier(random_state=42)']

In [41]:
best_model = model

# Paso 5.
- Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
- Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.

In [None]:
# =====================
# Paso 5: Guardar el modelo
# =====================

import gzip
import pickle

# Ruta donde se guardará el modelo
output_path = "files/models/model.pkl.gz"

# Guardar el modelo comprimido
with gzip.open(output_path, "wb") as f:
    pickle.dump(estimator, f)

print(f"Modelo guardado en: {output_path}")

Modelo guardado en: files/models/model.pkl.gz


# Paso 6.
 Calcule las metricas de precision, precision balanceada, recall,
 y f1-score para los conjuntos de entrenamiento y prueba.
 Guardelas en el archivo files/output/metrics.json. Cada fila
 del archivo es un diccionario con las metricas de un modelo.
 Este diccionario tiene un campo para indicar si es el conjunto
 de entrenamiento o prueba. Por ejemplo:

 {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
 {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

In [14]:
import pickle
import gzip

# cargar modelo PKL con el mismo nombre de best_model

with gzip.open("files/models/model.pkl.gz", "rb") as f:
   best_model = pickle.load(f)

In [49]:

from sklearn.metrics import (
    precision_score, recall_score, f1_score, balanced_accuracy_score,
    confusion_matrix
)

def eval_metrics(true, pred):

    metrics = {
    "type": "metrics",
    "dataset": "train",
    "precision": precision_score(true, pred, average="binary"),
    "balanced_accuracy": balanced_accuracy_score(true, pred),
    "recall": recall_score(true, pred, average="binary"),
    "f1_score": f1_score(true, pred, average="binary")
}
    return metrics

    # Calcular métricas para el conjunto de entrenamiento



def save_metrics(train_metrics, test_metrics, output_path):
    # Guardar las métricas en un archivo JSON, cada una en una línea separada
    with open(output_path, "w") as f:
        f.write(json.dumps(train_metrics) + "\n")
        f.write(json.dumps(test_metrics) + "\n")

    print(f"Métricas guardadas en: {output_path}")
import json


# Paso 7.
 Calcule las matrices de confusion para los conjuntos de entrenamiento y
 prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
 del archivo es un diccionario con las metricas de un modelo.
 de entrenamiento o prueba. Por ejemplo:

 {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicte_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
 {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicte_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}


In [44]:
# =====================
# Paso 7: Calcular matrices de confusión y guardarlas en el archivo JSON
# =====================

# Ruta del archivo de métricas
def save_matrix_confusion(y_train, pred, y_test, y_test_pred):
    output_path = "files/output/metrics.json"

    # Leer el contenido existente del archivo JSON
    metrics = []
    if os.path.exists(output_path):
        with open(output_path, "r") as f:
            metrics = [json.loads(line) for line in f]

    # Calcular la matriz de confusión para el conjunto de entrenamiento
    train_cm = confusion_matrix(y_train, pred)
    train_cm_dict = {
        "type": "cm_matrix",
        "dataset": "train",
        "true_0": {"predicted_0": int(train_cm[0, 0]), "predicted_1": int(train_cm[0, 1])},
        "true_1": {"predicted_0": int(train_cm[1, 0]), "predicted_1": int(train_cm[1, 1])}
    }

    # Calcular la matriz de confusión para el conjunto de prueba
    test_cm = confusion_matrix(y_test, y_test_pred)
    test_cm_dict = {
        "type": "cm_matrix",
        "dataset": "test",
        "true_0": {"predicted_0": int(test_cm[0, 0]), "predicted_1": int(test_cm[0, 1])},
        "true_1": {"predicted_0": int(test_cm[1, 0]), "predicted_1": int(test_cm[1, 1])}
    }

    # Agregar las matrices de confusión a las métricas existentes
    metrics.append(train_cm_dict)
    metrics.append(test_cm_dict)

    # Guardar todas las métricas (incluyendo las matrices de confusión) en el archivo JSON
    with open(output_path, "w") as f:
        for metric in metrics:
            f.write(json.dumps(metric) + "\n")

    print(f"Matrices de confusión agregadas en: {output_path}")

In [50]:
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

train_metrics = eval_metrics(y_train, y_train_pred)
test_metrics = eval_metrics(y_test, y_test_pred)