In [12]:
import pandas as pd
import numpy as np
import pickle

# Load the zipped training CSV. df1_ is a copy of the frame exactly as loaded;
# its column summary is printed below.
df1 = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)
df1_ = df1.copy()
df1_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          21000 non-null  int64
 1   LIMIT_BAL                   21000 non-null  int64
 2   SEX                         21000 non-null  int64
 3   EDUCATION                   21000 non-null  int64
 4   MARRIAGE                    21000 non-null  int64
 5   AGE                         21000 non-null  int64
 6   PAY_0                       21000 non-null  int64
 7   PAY_2                       21000 non-null  int64
 8   PAY_3                       21000 non-null  int64
 9   PAY_4                       21000 non-null  int64
 10  PAY_5                       21000 non-null  int64
 11  PAY_6                       21000 non-null  int64
 12  BILL_AMT1                   21000 non-null  int64
 13  BILL_AMT2                   21000 non-null  int64
 14  BILL_A

In [13]:
# Load the zipped test CSV. df2_ is a copy of the frame exactly as loaded;
# its column summary is printed below.
df2 = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)
df2_ = df2.copy()
df2_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          9000 non-null   int64
 1   LIMIT_BAL                   9000 non-null   int64
 2   SEX                         9000 non-null   int64
 3   EDUCATION                   9000 non-null   int64
 4   MARRIAGE                    9000 non-null   int64
 5   AGE                         9000 non-null   int64
 6   PAY_0                       9000 non-null   int64
 7   PAY_2                       9000 non-null   int64
 8   PAY_3                       9000 non-null   int64
 9   PAY_4                       9000 non-null   int64
 10  PAY_5                       9000 non-null   int64
 11  PAY_6                       9000 non-null   int64
 12  BILL_AMT1                   9000 non-null   int64
 13  BILL_AMT2                   9000 non-null   int64
 14  BILL_AMT

In [14]:
def preprocess_data(df):
    """Clean a raw credit-default frame and return a new, filtered frame.

    Steps, in order:
      1. Rename ``'default payment next month'`` to ``'default'``.
      2. Drop the ``ID`` column.
      3. Drop every row that contains a missing value.
      4. Cast ``EDUCATION`` to int and keep only rows whose value is
         non-zero and not greater than 3.
      5. Keep only rows whose ``MARRIAGE`` value is non-zero.

    The input frame is not modified. Surviving rows keep their original
    index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``ID``, ``EDUCATION`` and ``MARRIAGE``
        columns.

    Returns
    -------
    pandas.DataFrame
        An independent, filtered copy of ``df``.

    Raises
    ------
    KeyError
        If ``ID``, ``EDUCATION`` or ``MARRIAGE`` is missing from ``df``.
    """
    cleaned = (
        df.rename(columns={'default payment next month': 'default'})
        .drop(columns=['ID'])
        .dropna()
    )

    # Vectorised integer cast replaces the previous row-wise apply().
    education = cleaned['EDUCATION'].astype(int)
    cleaned = cleaned.assign(EDUCATION=education)

    # Levels above 3 used to be recoded to 0 and then dropped together with
    # the genuine zeros; a single boolean mask expresses the same rule.
    keep = (education <= 3) & (education != 0) & (cleaned['MARRIAGE'] != 0)

    # .copy() hands the caller a frame they can assign into without
    # chained-assignment warnings.
    return cleaned.loc[keep].copy()
# Preprocess from the pristine copies (df1_/df2_) so that re-running this cell
# is idempotent: feeding the already-processed df1/df2 back in fails on the
# second run because their ID column has already been dropped.
df1 = preprocess_data(df1_)
df2 = preprocess_data(df2_)
df1['EDUCATION'].unique()

array([3, 2, 1])

In [15]:
def make_train_test_split(train_data, test_data, target_column):
    """Separate features and target for pre-split train and test frames.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames that both contain ``target_column``.
    target_column : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` where each ``x_*`` is the frame
        without ``target_column`` and each ``y_*`` is that column.
    """

    def _features_and_target(frame):
        # Features are every column except the target; the target is the
        # single named column.
        return frame.drop(columns=[target_column]), frame[target_column]

    x_train, y_train = _features_and_target(train_data)
    x_test, y_test = _features_and_target(test_data)

    return x_train, x_test, y_train, y_test

# Split the preprocessed train/test frames into features and the 'default' target.
x_train, x_test, y_train, y_test = make_train_test_split(
    train_data=df1, test_data=df2, target_column='default'
)

In [16]:
def make_pipeline(estimator):
    """Wrap ``estimator`` in a three-step scikit-learn ``Pipeline``.

    Steps, in order:
      * ``"tranformer"``: a ``ColumnTransformer`` that one-hot encodes
        ``SEX``, ``EDUCATION`` and ``MARRIAGE`` (unknown categories are
        ignored) and passes every remaining column through
        ``StandardScaler``.
      * ``"selectkbest"``: ``SelectKBest`` scored with ``f_classif``; ``k``
        is not set here.
      * ``"estimator"``: the estimator passed in.

    The step name ``"tranformer"`` is spelled exactly as shown because step
    names are part of the pipeline's parameter keys.

    Parameters
    ----------
    estimator
        Final estimator to place at the end of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    one_hot_columns = ['SEX', 'EDUCATION', 'MARRIAGE']
    encoder = OneHotEncoder(handle_unknown='ignore')

    column_stage = ColumnTransformer(
        transformers=[('cat', encoder, one_hot_columns)],
        remainder=StandardScaler(),
    )
    selection_stage = SelectKBest(score_func=f_classif)

    return Pipeline(
        steps=[
            ("tranformer", column_stage),
            ("selectkbest", selection_stage),
            ("estimator", estimator),
        ],
        verbose=False,
    )

In [17]:
def make_grid_search(estimator, param_grid, cv=10):
    """Build an unfitted ``GridSearchCV`` scored by balanced accuracy.

    Parameters
    ----------
    estimator
        Estimator (or pipeline) to tune.
    param_grid : dict
        Grid passed through to ``GridSearchCV``.
    cv : int, default 10
        Cross-validation setting passed through to ``GridSearchCV``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Search object configured with ``scoring='balanced_accuracy'`` and
        ``n_jobs=-1``.
    """
    from sklearn.model_selection import GridSearchCV

    search_options = dict(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring='balanced_accuracy',
        n_jobs=-1,
    )
    return GridSearchCV(**search_options)

In [18]:
def save_estimator(estimator):
    """Pickle ``estimator`` into the gzip file ``../files/models/model.pkl.gz``.

    The ``../files/models`` directory is created when missing. Both paths are
    relative to the current working directory, and an existing model file is
    overwritten.

    Parameters
    ----------
    estimator
        Any picklable object.
    """
    import gzip
    import os
    import pickle

    # Ensure the destination folder is there before opening the file.
    os.makedirs("../files/models", exist_ok=True)

    # Stream the pickle straight into the gzip-compressed file.
    with gzip.open("../files/models/model.pkl.gz", "wb") as compressed:
        pickle.dump(estimator, compressed)


In [19]:
def load_estimator():
    """Load a previously saved model from ``../files/models``.

    The gzip file ``model.pkl.gz`` is tried first, then the uncompressed
    ``model.pkl``. Paths are relative to the current working directory.

    Returns
    -------
    object or None
        The unpickled object from the first file that exists, or ``None``
        when neither file is present.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only load model files that
    come from a trusted source.
    """
    import gzip
    import os
    import pickle

    # (path, opener) pairs in priority order: compressed first, plain second.
    candidates = (
        ("../files/models/model.pkl.gz", gzip.open),
        ("../files/models/model.pkl", open),
    )

    for model_path, opener in candidates:
        if os.path.exists(model_path):
            with opener(model_path, "rb") as handle:
                return pickle.load(handle)

    # Neither file exists.
    return None


In [20]:
def train_RandomForestClassifier():
    """Fit the random-forest grid search once and persist the better model.

    Reads the module-level frames ``df1`` (train) and ``df2`` (test), splits
    them on the ``'default'`` column, builds the pipeline around a
    ``RandomForestClassifier(random_state=42, class_weight="balanced")`` and
    runs a 10-fold grid search over a single-candidate grid.

    If ``load_estimator()`` returns a previously saved model, both models are
    scored on the test split with mean absolute error, and the saved one is
    kept only when its error is strictly lower. Whichever model wins is
    written back with ``save_estimator``.

    Returns
    -------
    None
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import mean_absolute_error

    x_train, x_test, y_train, y_test = make_train_test_split(
        train_data=df1,
        test_data=df2,
        target_column='default',
    )

    pipeline = make_pipeline(
        estimator=RandomForestClassifier(random_state=42, class_weight="balanced"),
    )

    estimator = make_grid_search(
        estimator=pipeline,
        param_grid={
            'estimator__n_estimators': [500],
            'estimator__max_depth': [230],
            'estimator__min_samples_split': [6],
            'estimator__min_samples_leaf': [2],
        },
        cv=10,
    )

    # Fit exactly once. The search used to be fitted a second time inside
    # print(), which doubled the cost of the whole cross-validated search.
    estimator.fit(x_train, y_train)
    print(estimator)

    best_estimator = load_estimator()

    if best_estimator is not None:
        # NOTE(review): the comparison uses the test split and MAE, while the
        # grid search itself optimises balanced accuracy. Confirm that this is
        # the intended selection criterion.
        saved_mae = mean_absolute_error(
            y_true=y_test, y_pred=best_estimator.predict(x_test)
        )
        current_mae = mean_absolute_error(
            y_true=y_test, y_pred=estimator.predict(x_test)
        )

        # Keep the stored model only when it is strictly better.
        if saved_mae < current_mae:
            estimator = best_estimator

    save_estimator(estimator)


# Run the training routine: it fits the grid search on the train split and
# writes the retained model to disk via save_estimator.
train_RandomForestClassifier()

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tranformer',
                                        ColumnTransformer(remainder=StandardScaler(),
                                                          transformers=[('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['SEX',
                                                                          'EDUCATION',
                                                                          'MARRIAGE'])])),
                                       ('selectkbest', SelectKBest()),
                                       ('estimator',
                                        RandomForestClassifier(class_weight='balanced',
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'estimator__max_depth': [230],
            

In [21]:
# import os
# import json
# from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix

# def save_metrics_and_confusion_matrices(estimator, x_train, y_train, x_test, y_test):
#     """
#     Genera las métricas y matrices de confusión y las guarda en 'files/output/metrics.json'.
#     """
#     if estimator is None:
#         raise ValueError("El modelo no se pudo cargar. Asegúrate de haber guardado el modelo correctamente.")
    
#     # Predicciones
#     y_train_pred = estimator.predict(x_train)
#     y_test_pred = estimator.predict(x_test)

# # Calcular métricas para la clase minoritaria (clase 1)
#     metrics_train = {
#     "type": "metrics",
#     "dataset": "train",
#     "precision": precision_score(y_train, y_train_pred, pos_label=0),
#     "balanced_accuracy": balanced_accuracy_score(y_train, y_train_pred),
#     "recall": recall_score(y_train, y_train_pred, pos_label=0),
#     "f1_score": f1_score(y_train, y_train_pred, pos_label=0)
#     }

#     metrics_test = {
#     "type": "metrics",
#     "dataset": "test",
#     "precision": precision_score(y_test, y_test_pred, pos_label=0),
#     "balanced_accuracy": balanced_accuracy_score(y_test, y_test_pred),
#     "recall": recall_score(y_test, y_test_pred, pos_label=0),
#     "f1_score": f1_score(y_test, y_test_pred, pos_label=0)
#     }

#     # Calcular matrices de confusión
#     def cm_to_dict(cm, dataset_name):
#         return {
#             "type": "cm_matrix",
#             "dataset": dataset_name,
#             "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
#             "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
#         }
    
#     cm_train = confusion_matrix(y_train, y_train_pred)
#     cm_test = confusion_matrix(y_test, y_test_pred)
    
#     cm_train_dict = {
#         "type": "cm_matrix",
#         "dataset": "train",
#         "true_0": {"predicted_0": int(cm_train[0, 0]), "predicted_1": int(cm_train[0, 1])},
#         "true_1": {"predicted_0": int(cm_train[1, 0]), "predicted_1": int(cm_train[1, 1])},
#     }
#     cm_test_dict = {
#         "type": "cm_matrix",
#         "dataset": "test",
#         "true_0": {"predicted_0": int(cm_test[0, 0]), "predicted_1": int(cm_test[0, 1])},
#         "true_1": {"predicted_0": int(cm_test[1, 0]), "predicted_1": int(cm_test[1, 1])},
#     }

#     # Guardar métricas y matrices en un archivo JSON
#     output_directory = os.path.abspath("../files/output")
#     output_file = "metrics.json"
#     output_path = os.path.join(output_directory, output_file)

#     if not os.path.exists(output_directory):
#         os.makedirs(output_directory, exist_ok=True)

#     try:
#         with open(output_path, "w", encoding="utf-8") as file:
#             file.write(json.dumps(metrics_train) + "\n")
#             file.write(json.dumps(metrics_test) + "\n")
#             file.write(json.dumps(cm_train_dict) + "\n")
#             file.write(json.dumps(cm_test_dict) + "\n")
#         print(f"Métricas y matrices de confusión guardadas correctamente en: {output_path}")
#     except Exception as e:
#         print(f"Error al guardar las métricas y matrices de confusión: {e}")
    
#     # Imprimir el reporte de clasificación para verificación
#     print("Reporte de clasificación (entrenamiento):")
#     print(classification_report(y_train, y_train_pred))
#     print("\nReporte de clasificación (prueba):")
#     print(classification_report(y_test, y_test_pred))


# # Ejemplo de uso
# estimator = load_estimator()
# save_metrics_and_confusion_matrices(estimator, x_train, y_train, x_test, y_test)


In [22]:
import os
import json
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix

def save_metrics_and_confusion_matrices(estimator, x_train, y_train, x_test, y_test):
    """Compute train/test metrics and confusion matrices and write them to
    ``../files/output/metrics.json``.

    The file holds four JSON objects, one per line: train metrics, test
    metrics, train confusion matrix, test confusion matrix. Classification
    reports for both splits are printed afterwards.

    Precision, recall and f1 use scikit-learn's default binary averaging, so
    they describe the positive class (label 1). They were previously computed
    with ``average='micro'``, which for single-label classification makes all
    three equal to plain accuracy.

    Parameters
    ----------
    estimator
        Fitted model exposing ``predict``.
    x_train, y_train, x_test, y_test
        Feature/target splits. The targets are expected to use labels 0 and 1.

    Raises
    ------
    ValueError
        If ``estimator`` is ``None``.
    """
    if estimator is None:
        raise ValueError("El modelo no se pudo cargar. Asegúrate de haber guardado el modelo correctamente.")

    # Predictions for both splits.
    y_train_pred = estimator.predict(x_train)
    y_test_pred = estimator.predict(x_test)

    def metrics_to_dict(y_true, y_pred, dataset_name):
        # Scores for the positive class (label 1) plus balanced accuracy.
        return {
            "type": "metrics",
            "dataset": dataset_name,
            "precision": precision_score(y_true, y_pred),
            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
            "recall": recall_score(y_true, y_pred),
            "f1_score": f1_score(y_true, y_pred),
        }

    def cm_to_dict(y_true, y_pred, dataset_name):
        # Fixing labels=[0, 1] guarantees a 2x2 matrix even when one class is
        # absent from a split, so the [row, col] lookups below cannot fail.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        return {
            "type": "cm_matrix",
            "dataset": dataset_name,
            "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
            "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
        }

    records = [
        metrics_to_dict(y_train, y_train_pred, "train"),
        metrics_to_dict(y_test, y_test_pred, "test"),
        cm_to_dict(y_train, y_train_pred, "train"),
        cm_to_dict(y_test, y_test_pred, "test"),
    ]

    # Serialise before opening the file so that a serialisation error cannot
    # leave a truncated metrics.json behind.
    lines = [json.dumps(record) + "\n" for record in records]

    output_directory = os.path.abspath("../files/output")
    output_file = "metrics.json"
    output_path = os.path.join(output_directory, output_file)
    os.makedirs(output_directory, exist_ok=True)

    try:
        with open(output_path, "w", encoding="utf-8") as file:
            file.writelines(lines)
        print(f"Métricas y matrices de confusión guardadas correctamente en: {output_path}")
    except OSError as e:
        # Writing stays best-effort: report the I/O failure and still print
        # the reports below.
        print(f"Error al guardar las métricas y matrices de confusión: {e}")

    # Print the classification reports for a quick visual check.
    print("Reporte de clasificación (entrenamiento):")
    print(classification_report(y_train, y_train_pred))
    print("\nReporte de clasificación (prueba):")
    print(classification_report(y_test, y_test_pred))


# Example usage: reload the persisted model and write its metrics report.
estimator = load_estimator()
save_metrics_and_confusion_matrices(
    estimator, x_train, y_train, x_test, y_test
)

Métricas y matrices de confusión guardadas correctamente en: d:\ever\PhD\Predictiva\LAB_01_prediccion_del_default_usando_bosques_aleatorios\files\output\metrics.json
Reporte de clasificación (entrenamiento):
              precision    recall  f1-score   support

           0       0.97      0.92      0.95     15925
           1       0.77      0.91      0.83      4703

    accuracy                           0.92     20628
   macro avg       0.87      0.91      0.89     20628
weighted avg       0.93      0.92      0.92     20628


Reporte de clasificación (prueba):
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      6955
           1       0.53      0.53      0.53      1895

    accuracy                           0.80      8850
   macro avg       0.70      0.70      0.70      8850
weighted avg       0.80      0.80      0.80      8850

