In [116]:
import pandas as pd
import numpy as np
import pickle

# Read the zipped train/test CSVs into df1/df2.
df1 = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)
df2 = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)

# Working copies (df1_/df2_) are what the later cells clean and model on.
df1_ = df1.copy()
df2_ = df2.copy()


In [117]:
# Paso 1.
# Realice la limpieza de los datasets:
# - Renombre la columna "default payment next month" a "default".
# - Remueva la columna "ID".
# - Elimine los registros con informacion no disponible.
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".
# - los valores 0 en education deben eliminarse
# - Renombre la columna "default payment next month" a "default"
# - Remueva la columna "ID".

def preprocess_data(df):
    """Return a cleaned copy of a raw credit-default frame.

    The input frame is left untouched; all work happens on a new frame.

    Cleaning steps:
      * rename "default payment next month" to "default";
      * drop the "ID" column;
      * drop rows containing NaN;
      * drop rows whose EDUCATION or MARRIAGE code is 0 (treated as
        unavailable information);
      * collapse EDUCATION codes above 4 into 4 ("others").

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns "ID", "EDUCATION",
        "MARRIAGE" and "default payment next month".

    Returns
    -------
    pandas.DataFrame
        A new, independent frame (original index labels are preserved).

    Raises
    ------
    KeyError
        If "ID", "EDUCATION" or "MARRIAGE" is missing from ``df``.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        .dropna()
    )

    # Code 0 means "not available" for both columns, so those rows are removed
    # instead of being folded into the "others" education category.
    has_information = (cleaned["EDUCATION"] != 0) & (cleaned["MARRIAGE"] != 0)

    # .copy() detaches the result from the filtered view so that later column
    # assignments by callers cannot trigger SettingWithCopy problems.
    cleaned = cleaned.loc[has_information].copy()

    cleaned.loc[cleaned["EDUCATION"] > 4, "EDUCATION"] = 4
    return cleaned

# Clean both working frames (train first, then test) and display the
# EDUCATION codes that remain in the test frame.
df1_, df2_ = preprocess_data(df1_), preprocess_data(df2_)
df2_['EDUCATION'].unique()

array([2, 3, 1, 4])

In [118]:
# Paso 2. cree una funcion de la siguiente forma:
# Divida los datasets en x_train, y_train, x_test, y_test.
# Donde: df1_ es train y df2_ es test
# y_train y y_test son las columnas "default"
# x_train y x_test son las columnas restantes

def make_train_test_split(train_data, test_data, target_column):
    """Separate features from the target in the train and test frames.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames that both contain ``target_column``.
    target_column : str
        Name of the column holding the label.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` -- note the order: each
        feature frame is immediately followed by its target series.
    """

    def _features_and_target(frame):
        # Features are every column except the target; the target is the column itself.
        return frame.drop(columns=[target_column]), frame[target_column]

    x_train, y_train = _features_and_target(train_data)
    x_test, y_test = _features_and_target(test_data)
    return x_train, y_train, x_test, y_test

In [119]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding, recordar que las variables categoricas son: SEX, MARRIAGE, EDUCATION.
# - Ajusta un modelo de bosques aleatorios (random forest).
# - Ajuste el modelo al dataset de entrenamiento.



def make_pipeline(estimator):
    """Build the classification pipeline around ``estimator``.

    The pipeline has three steps:
      1. "tranformer"  -- one-hot encodes SEX, EDUCATION and MARRIAGE and
         passes every other column through unchanged;
      2. "selectkbest" -- ``SelectKBest`` scored with ``f_classif`` (using
         scikit-learn's default ``k``);
      3. "estimator"   -- the estimator supplied by the caller.

    This function only constructs the pipeline. It does not read or modify
    any data frame: the categorical columns are encoded from whatever dtype
    the frame passed to ``fit``/``predict`` carries, so training and
    prediction always see the same representation.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Final step of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

    transformer = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough')
    selectkbest = SelectKBest(score_func=f_classif)

    # The step name "tranformer" (sic) is kept as-is: step names are part of
    # the parameter namespace (e.g. "tranformer__...") used by grid searches.
    pipeline = Pipeline(
        steps=[
            ("tranformer", transformer),
            ("selectkbest", selectkbest),
            ("estimator", estimator),
        ],
        verbose=False,
    )
    return pipeline


In [120]:
def make_grid_search(estimator, param_grid, cv=10):
    """Wrap ``estimator`` in a cross-validated grid search.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model (or pipeline) whose hyper-parameters are searched.
    param_grid : dict
        Grid handed to ``GridSearchCV``.
    cv : int, default 10
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Unfitted search scored with balanced accuracy, using all cores
        (``n_jobs=-1``).
    """
    from sklearn.model_selection import GridSearchCV

    search_options = {
        "cv": cv,
        "scoring": 'balanced_accuracy',
        "n_jobs": -1,
    }
    return GridSearchCV(estimator, param_grid, **search_options)

In [121]:
def save_estimator(estimator):
    """Pickle ``estimator`` into the gzip-compressed model file.

    The file is written to "../files/models/model.pkl.gz" (relative to the
    current working directory); the folder is created when it is missing and
    an existing file is overwritten.
    """
    import os
    import gzip
    import pickle

    model_path = "../files/models/model.pkl.gz"

    # Make sure the target folder ("../files/models") exists before writing.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    with gzip.open(model_path, "wb") as handle:
        pickle.dump(estimator, handle)

In [122]:
def load_estimator():
    """Load the pickled model from "../files/models/model.pkl.gz".

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when the file does not exist.

    Notes
    -----
    Unpickling can execute arbitrary code, so this should only be pointed at
    a model file produced by this notebook.
    """
    import os
    import gzip
    import pickle

    model_path = "../files/models/model.pkl.gz"

    if os.path.exists(model_path):
        with gzip.open(model_path, "rb") as handle:
            return pickle.load(handle)

    # No saved model yet.
    return None

In [123]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.
# Guarde el modelo en el archivo model.pkl.gz

def train_RandomForestClassifier():
    """Fit the random-forest pipeline with a grid search and save it.

    Uses the module-level frames ``df1_`` (train) and ``df2_`` (test) with
    "default" as the target, fits the search on the training split only,
    stores the fitted search via ``save_estimator`` and prints the best
    hyper-parameters found.
    """
    from sklearn.ensemble import RandomForestClassifier

    # Only the training split is needed here; the test split is discarded.
    x_train, y_train, _, _ = make_train_test_split(df1_, df2_, "default")

    forest = RandomForestClassifier(random_state=42, class_weight="balanced")
    search = make_grid_search(
        estimator=make_pipeline(estimator=forest),
        param_grid={
            "estimator__n_estimators": [200],
            "estimator__max_depth": [5],
        },
        cv=10,
    )
    search.fit(x_train, y_train)
    save_estimator(search)

    # Show the best hyper-parameters.
    print(search.best_params_)


train_RandomForestClassifier()

{'estimator__max_depth': 5, 'estimator__n_estimators': 200}


In [124]:
# Paso 6.
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

def calculate_metrics():
    """Score the saved model on the train and test splits.

    Splits the module-level frames ``df1_`` (train) and ``df2_`` (test) on
    the "default" column, loads the saved estimator and computes precision,
    balanced accuracy, recall and F1 for each split (scikit-learn's default
    positive label is used).

    Returns
    -------
    list of dict
        ``[metrics_train, metrics_test]``; each dict has the keys "dataset",
        "precision", "balanced_accuracy", "recall" and "f1_score".

    Raises
    ------
    ValueError
        If no saved model could be loaded.
    """
    from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score

    # make_train_test_split returns (x_train, y_train, x_test, y_test); the
    # names must be unpacked in exactly that order or features and labels
    # end up swapped.
    x_train, y_train, x_test, y_test = make_train_test_split(df1_, df2_, "default")

    estimator = load_estimator()
    if estimator is None:
        # load_estimator returns None when the model file is missing; fail
        # here with a clear message instead of on `None.predict`.
        raise ValueError("El modelo no se pudo cargar. Asegúrate de haber guardado el modelo correctamente.")

    def _score(dataset, y_true, y_pred):
        # One metrics record for a single split.
        return {
            "dataset": dataset,
            "precision": precision_score(y_true, y_pred),
            "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
            "recall": recall_score(y_true, y_pred),
            "f1_score": f1_score(y_true, y_pred),
        }

    y_train_pred = estimator.predict(x_train)
    y_test_pred = estimator.predict(x_test)

    return [
        _score("train", y_train, y_train_pred),
        _score("test", y_test, y_test_pred),
    ]


In [125]:
# Paso 7.
# Calcule las matrices de confusion para los conjuntos de entrenamiento y
# prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo. Este
# diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicted_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
# {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicted_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}
# NOTE(review): everything between the triple quotes below is a bare string
# literal, not executable code. This cell does not define
# calculate_confusion_matrix; it only evaluates the string.
# save_metrics_and_confusion_matrices builds the same confusion-matrix
# records (with int() casts) and is the live implementation.
# If this is ever re-enabled: it unpacks the split as
# (x_train, x_test, y_train, y_test), whereas make_train_test_split returns
# (x_train, y_train, x_test, y_test) -- the order would need correcting.
'''
def calculate_confusion_matrix():
    from sklearn.metrics import confusion_matrix

    x_train, x_test, y_train, y_test = make_train_test_split(df1_, df2_, "default")
    estimator = load_estimator()
    y_train_pred = estimator.predict(x_train)
    y_test_pred = estimator.predict(x_test)

    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    cm_matrix_train = {
        "type": "cm_matrix",
        "dataset": "train",
        "true_0": {"predicted_0": cm_train[0, 0], "predicted_1": cm_train[0, 1]},
        "true_1": {"predicted_0": cm_train[1, 0], "predicted_1": cm_train[1, 1]},
    }
    cm_matrix_test = {
        "type": "cm_matrix",
        "dataset": "test",
        "true_0": {"predicted_0": cm_test[0, 0], "predicted_1": cm_test[0, 1]},
        "true_1": {"predicted_0": cm_test[1, 0], "predicted_1": cm_test[1, 1]},
    }

    return [cm_matrix_train, cm_matrix_test]'''

'\ndef calculate_confusion_matrix():\n    from sklearn.metrics import confusion_matrix\n\n    x_train, x_test, y_train, y_test = make_train_test_split(df1_, df2_, "default")\n    estimator = load_estimator()\n    y_train_pred = estimator.predict(x_train)\n    y_test_pred = estimator.predict(x_test)\n\n    cm_train = confusion_matrix(y_train, y_train_pred)\n    cm_test = confusion_matrix(y_test, y_test_pred)\n\n    cm_matrix_train = {\n        "type": "cm_matrix",\n        "dataset": "train",\n        "true_0": {"predicted_0": cm_train[0, 0], "predicted_1": cm_train[0, 1]},\n        "true_1": {"predicted_0": cm_train[1, 0], "predicted_1": cm_train[1, 1]},\n    }\n    cm_matrix_test = {\n        "type": "cm_matrix",\n        "dataset": "test",\n        "true_0": {"predicted_0": cm_test[0, 0], "predicted_1": cm_test[0, 1]},\n        "true_1": {"predicted_0": cm_test[1, 0], "predicted_1": cm_test[1, 1]},\n    }\n\n    return [cm_matrix_train, cm_matrix_test]'

In [129]:
def _metrics_record(dataset, y_true, y_pred, pos_label):
    """Build the "metrics" record (precision, balanced accuracy, recall, F1) for one split."""
    from sklearn.metrics import balanced_accuracy_score, f1_score, precision_score, recall_score

    return {
        "type": "metrics",
        "dataset": dataset,
        "precision": precision_score(y_true, y_pred, average='binary', pos_label=pos_label),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred, average='binary', pos_label=pos_label),
        "f1_score": f1_score(y_true, y_pred, average='binary', pos_label=pos_label),
    }


def _confusion_record(dataset, y_true, y_pred):
    """Build the "cm_matrix" record for one split from its 2x2 confusion matrix."""
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(y_true, y_pred)
    # int() converts the NumPy integers so the record is JSON-serialisable.
    return {
        "type": "cm_matrix",
        "dataset": dataset,
        "true_0": {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        "true_1": {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
    }


def save_metrics_and_confusion_matrices(estimator, x_train, y_train, x_test, y_test,
                                        output_directory="../files/output", pos_label=0):
    """Evaluate ``estimator`` on both splits and write the results as JSON lines.

    Four records are written to ``<output_directory>/metrics.json``, one JSON
    object per line, in this order: train metrics, test metrics, train
    confusion matrix, test confusion matrix. The file is overwritten. The
    classification reports for both splits are printed afterwards.

    Parameters
    ----------
    estimator : fitted scikit-learn estimator
        Must expose ``predict``.
    x_train, y_train, x_test, y_test
        Feature frames and target series of the two splits.
    output_directory : str, default "../files/output"
        Folder that receives ``metrics.json``; created when missing.
    pos_label : int, default 0
        Class treated as positive for precision, recall and F1.
        NOTE(review): the default of 0 scores the "no default" class, while
        ``calculate_metrics`` in this notebook relies on scikit-learn's
        default positive label (1), so the two report different
        precision/recall/F1 values -- confirm which one is intended.

    Raises
    ------
    ValueError
        If ``estimator`` is ``None``.
    Exception
        Any error raised while writing the file is reported and re-raised.
    """
    import os
    import json
    from sklearn.metrics import classification_report

    if estimator is None:
        raise ValueError("El modelo no se pudo cargar. Asegúrate de haber guardado el modelo correctamente.")

    # Predictions for both splits.
    y_train_pred = estimator.predict(x_train)
    y_test_pred = estimator.predict(x_test)

    # Records in the order they are written to the file.
    records = [
        _metrics_record("train", y_train, y_train_pred, pos_label),
        _metrics_record("test", y_test, y_test_pred, pos_label),
        _confusion_record("train", y_train, y_train_pred),
        _confusion_record("test", y_test, y_test_pred),
    ]

    os.makedirs(output_directory, exist_ok=True)
    output_path = os.path.join(output_directory, "metrics.json")

    try:
        with open(output_path, "w", encoding="utf-8") as file:
            for record in records:
                file.write(json.dumps(record) + "\n")
        print(f"Métricas y matrices de confusión guardadas correctamente en: {output_path}")
    except Exception as e:
        print(f"Error al guardar el archivo JSON: {e}")
        raise

    # Classification reports.
    print("Reporte de clasificación (entrenamiento):")
    print(classification_report(y_train, y_train_pred))
    print("\nReporte de clasificación (prueba):")
    print(classification_report(y_test, y_test_pred))

