In [1]:
#Carga de datos

def load_data(file_path):
    """Read a zip-compressed CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the zip archive that holds the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table. ``index_col=False`` is passed to the reader, so
        no column of the file is used as the row index.
    """
    import pandas as pd

    return pd.read_csv(file_path, compression="zip", index_col=False)

In [2]:
#Limpieza de datos

def clean_data(data):
    """Return a cleaned copy of the raw dataset; the input is not modified.

    Steps, in order:

    1. Rename ``"default payment next month"`` to ``"default"``.
    2. Drop the ``"ID"`` column.
    3. Keep only rows whose ``EDUCATION`` and ``MARRIAGE`` values are both
       different from 0.
    4. Replace every ``EDUCATION`` value greater than 4 with 4.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table containing at least the columns ``ID``, ``EDUCATION`` and
        ``MARRIAGE``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations applied. The original row
        labels are kept (the index is not reset).

    Raises
    ------
    KeyError
        If ``ID``, ``EDUCATION`` or ``MARRIAGE`` is missing from ``data``.
    """
    # rename/drop both return new objects, so the caller's frame stays intact.
    df = data.rename(columns={"default payment next month": "default"}).drop(
        columns="ID"
    )

    valid_rows = (df["EDUCATION"] != 0) & (df["MARRIAGE"] != 0)
    # Explicit copy: assigning a column on a boolean-filtered frame would
    # otherwise trigger pandas' SettingWithCopyWarning.
    df = df.loc[valid_rows].copy()

    # Vectorised equivalent of `4 if x > 4 else x` (missing values stay as-is).
    df["EDUCATION"] = df["EDUCATION"].mask(df["EDUCATION"] > 4, 4)

    return df


In [3]:
#Split de datos

def data_split(data):
    """Separate the feature columns from the ``default`` target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains a ``default`` column.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``data`` without the ``default`` column
        and ``y`` is the ``default`` column itself.
    """
    target_name = "default"
    features = data.drop(columns=[target_name])
    target = data[target_name]
    return features, target


In [4]:
#Creación de pipeline

def make_pipeline(estimator):
    """Build a preprocessing + model pipeline around ``estimator``.

    The preprocessing step one-hot encodes the ``EDUCATION``, ``SEX`` and
    ``MARRIAGE`` columns (with ``handle_unknown='ignore'``) and forwards all
    remaining columns unchanged (``remainder='passthrough'``).

    Parameters
    ----------
    estimator : object
        The model placed as the final ``'estimator'`` step of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and
        ``'estimator'``.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    one_hot_columns = ['EDUCATION', 'SEX', 'MARRIAGE']
    encoder = OneHotEncoder(handle_unknown='ignore')

    column_transformer = ColumnTransformer(
        transformers=[('cat', encoder, one_hot_columns)],
        remainder='passthrough',
    )

    return Pipeline(
        steps=[
            ('preprocessor', column_transformer),
            ('estimator', estimator),
        ],
        verbose=False,
    )


In [5]:
#Grid search

def make_grid_search(estimator, param_grid, cv=10):
    """Wrap ``estimator`` in a cross-validated grid search.

    Parameters
    ----------
    estimator : object
        Estimator (or pipeline) whose hyperparameters are searched.
    param_grid : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    cv : int, default 10
        Cross-validation setting handed to ``GridSearchCV``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Unfitted search object scored with ``"balanced_accuracy"``,
        configured with ``n_jobs=-1`` and ``verbose=2``.
    """
    from sklearn.model_selection import GridSearchCV

    search_options = dict(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        scoring="balanced_accuracy",
        n_jobs=-1,
        verbose=2,
    )
    return GridSearchCV(**search_options)


In [6]:
#Guardado del estimador comprimido

def save_estimator_compressed(estimator, file_path="../files/models/model.pkl.gz"):
    """Pickle ``estimator`` into a gzip-compressed file.

    Parameters
    ----------
    estimator : object
        Any picklable object (typically a fitted model).
    file_path : str, default "../files/models/model.pkl.gz"
        Destination file. Missing parent directories are created; an
        existing file is overwritten.

    Returns
    -------
    None
    """
    import gzip
    import os
    import pickle

    parent_dir = os.path.dirname(file_path)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with gzip.open(file_path, "wb") as file:
        pickle.dump(estimator, file)


In [7]:
#Carga del estimador comprimido

def load_estimator_compressed(file_path="../files/models/model.pkl.gz"):
    """Load an object from a gzip-compressed pickle file.

    Any failure (missing file, unreadable archive, unpickling error) is
    reported with ``print`` and turned into a ``None`` return value instead
    of propagating to the caller.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.

    Parameters
    ----------
    file_path : str, default "../files/models/model.pkl.gz"
        Location of the compressed pickle.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when loading failed.
    """
    import gzip
    import os
    import pickle

    try:
        if os.path.exists(file_path):
            with gzip.open(file_path, "rb") as handle:
                return pickle.load(handle)
        # Raised inside the try block on purpose so the missing-file case is
        # reported through the same handler as every other failure.
        raise FileNotFoundError(f"El archivo {file_path} no se encuentra.")
    except Exception as e:
        print(f"Error al cargar el modelo: {e}")

    return None


In [8]:
#Métricas de evaluación

def eval_metrics(y_true, y_pred):
    """Compute MSE, MAE and R2 between true and predicted values.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` in that order.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

In [9]:
#Reporte de métricas

def report(estimator, mse, mae, r2):
    """Print the estimator followed by its MSE, MAE and R2 values.

    Parameters
    ----------
    estimator : object
        Printed (via ``str``) as the heading, followed by a colon.
    mse, mae, r2 : object
        Values printed on the three lines below the heading.

    Returns
    -------
    None
    """
    lines = (
        str(estimator) + ":",
        f"  MSE: {mse}",
        f"  MAE: {mae}",
        f"   R2: {r2}",
    )
    for line in lines:
        print(line)

In [10]:
#Chequeo del estimador

def check_estimator(
    data_path="../files/input/test_data.csv.zip",
    model_path="../files/models/model.pkl.gz",
):
    """Load the saved estimator, evaluate it on a dataset and print a report.

    The dataset is read with ``load_data``, cleaned with ``clean_data`` and
    split with ``data_split``; the predictions of the saved estimator are
    then scored with ``eval_metrics`` and printed with ``report``.

    Parameters
    ----------
    data_path : str, default "../files/input/test_data.csv.zip"
        Zip-compressed CSV used for the evaluation.
    model_path : str, default "../files/models/model.pkl.gz"
        Gzip-compressed pickle that holds the estimator.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If the estimator cannot be loaded from ``model_path``.
    """
    data = clean_data(load_data(data_path))
    x_test, y_test_true = data_split(data)

    estimator = load_estimator_compressed(model_path)
    if estimator is None:
        # load_estimator_compressed has already printed the underlying cause.
        raise RuntimeError(f"No se pudo cargar el modelo desde {model_path}.")

    mse, mae, r2 = eval_metrics(
        y_test_true,
        estimator.predict(x_test),
    )

    # A fitted search object exposes the selected model as `best_estimator_`;
    # anything else is reported as-is.
    report(getattr(estimator, "best_estimator_", estimator), mse, mae, r2)


In [11]:
#Cálculo y guardado de métricas

def calculate_and_save_metrics(
    model,
    x_train,
    x_test,
    y_train,
    y_test,
    output_path="../files/output/metrics.json",
):
    """Score ``model`` on the train and test splits and write the results.

    For each split the precision, balanced accuracy, recall and F1 score are
    computed (with ``zero_division=0`` where the metric accepts it) and
    written as one JSON object per line. The file is opened in write mode,
    so previous content is replaced.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    x_train, x_test : array-like
        Feature sets of the train and test splits.
    y_train, y_test : array-like
        True labels of the train and test splits.
    output_path : str, default "../files/output/metrics.json"
        Destination file. Missing parent directories are created.

    Returns
    -------
    None
    """
    import json
    import os

    from sklearn.metrics import (
        balanced_accuracy_score,
        f1_score,
        precision_score,
        recall_score,
    )

    def _metrics_record(dataset, y_true, y_pred):
        """Build the JSON-serialisable metrics record of one split."""
        # float() keeps serialisation independent of the scalar type that the
        # metric functions return.
        return {
            'type': 'metrics',
            'dataset': dataset,
            'precision': float(precision_score(y_true, y_pred, zero_division=0)),
            'balanced_accuracy': float(balanced_accuracy_score(y_true, y_pred)),
            'recall': float(recall_score(y_true, y_pred, zero_division=0)),
            'f1_score': float(f1_score(y_true, y_pred, zero_division=0)),
        }

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    metrics = [
        _metrics_record('train', y_train, y_train_pred),
        _metrics_record('test', y_test, y_test_pred),
    ]

    output_dir = os.path.dirname(output_path)
    if output_dir:  # os.makedirs("") would raise for a bare file name
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w") as f:
        for metric in metrics:
            f.write(json.dumps(metric) + '\n')


In [12]:
#Cálculo y guardado de matrices de confusión

def calculate_and_save_confusion_matrices(
    model,
    x_train,
    x_test,
    y_train,
    y_test,
    output_path="../files/output/metrics.json",
):
    """Append the train and test confusion matrices of ``model`` to a file.

    One JSON object per split is appended to ``output_path``. Because the
    file is opened in append mode, calling this function repeatedly adds
    repeated lines.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    x_train, x_test : array-like
        Feature sets of the train and test splits.
    y_train, y_test : array-like
        True labels of the train and test splits; the classes reported are
        0 and 1.
    output_path : str, default "../files/output/metrics.json"
        File to append to. Missing parent directories are created.

    Returns
    -------
    None
    """
    import json
    import os

    from sklearn.metrics import confusion_matrix

    def _matrix_record(dataset, y_true, y_pred):
        """Build the JSON-serialisable confusion-matrix record of one split."""
        # Fixing the labels guarantees a 2x2 matrix even when a split (or the
        # predictions) contains a single class, so the indexing below is safe.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        return {
            'type': 'cm_matrix',
            'dataset': dataset,
            'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
            'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])},
        }

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    matrices = [
        _matrix_record('train', y_train, y_train_pred),
        _matrix_record('test', y_test, y_test_pred),
    ]

    output_dir = os.path.dirname(output_path)
    if output_dir:  # os.makedirs("") would raise for a bare file name
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "a") as f:
        for matrix in matrices:
            f.write(json.dumps(matrix) + '\n')


In [13]:
#Ejecución del programa 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score

# Step 1: load and clean the data

# train
data_train = clean_data(load_data("../files/input/train_data.csv.zip"))
# test
data_test = clean_data(load_data("../files/input/test_data.csv.zip"))

# Step 2: split each dataset into features and target

x_train, y_train = data_split(data_train)
x_test, y_test = data_split(data_test)

# Step 3: build the pipeline around a seeded random forest

pipeline = make_pipeline(estimator=RandomForestClassifier(random_state=42))


In [14]:
# Step 4: define the hyperparameter grid and tune the model

param_grid = {
    # single value for n_estimators
    'estimator__n_estimators': [100],
    # reduced to two values
    'estimator__max_depth': [None, 10],
    # single value for min_samples_split
    'estimator__min_samples_split': [2],
    # single value for min_samples_leaf
    'estimator__min_samples_leaf': [1],
    # 'sqrt' used in place of 'auto'
    'estimator__max_features': ['sqrt'],
    # single value for class_weight
    'estimator__class_weight': [None],
}

estimator = make_grid_search(estimator=pipeline, param_grid=param_grid, cv=10)
estimator.fit(x_train, y_train)

# Returns None (after printing a message) when no model has been saved yet.
best_estimator = load_estimator_compressed()

if best_estimator is not None:
    # Score the previously saved model and the freshly fitted one on the
    # test split, and keep the saved one only if it scores strictly higher.
    saved_balanced_accuracy = balanced_accuracy_score(
        y_test, best_estimator.predict(x_test)
    )
    current_balanced_accuracy = balanced_accuracy_score(
        y_test, estimator.predict(x_test)
    )

    if saved_balanced_accuracy > current_balanced_accuracy:
        estimator = best_estimator


# Step 5: save the model
save_estimator_compressed(estimator)


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Error al cargar el modelo: El archivo ../files/models/model.pkl.gz no se encuentra.


In [15]:
# Steps 6 and 7: compute and save the metrics, then the confusion matrices.
# Order matters: the metrics writer (re)creates the output file and the
# confusion-matrix writer appends to it.
for save_results in (
    calculate_and_save_metrics,
    calculate_and_save_confusion_matrices,
):
    save_results(estimator, x_train, x_test, y_train, y_test)

