In [1]:
def load_data(file_path):
    """Read a zip-compressed CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str
        Location of the zip archive that contains the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table, read with ``index_col=False`` so no column is
        promoted to the index.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    """
    import os

    import pandas as pd

    # Check up front so a missing file produces the notebook's own message.
    if os.path.exists(file_path):
        return pd.read_csv(file_path, index_col=False, compression="zip")

    raise FileNotFoundError(f"El archivo de prueba no se encuentra: {file_path}")

In [2]:
def clean_data(data):
    """Return a cleaned copy of the raw credit-default table.

    Steps, in order:

    1. Rename ``"default payment next month"`` to ``"default"``.
    2. Drop the ``"ID"`` column.
    3. Remove rows whose ``EDUCATION`` or ``MARRIAGE`` value is 0.
    4. Collapse every ``EDUCATION`` value greater than 4 into 4.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table; must contain ``ID``, ``EDUCATION`` and ``MARRIAGE``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched). The surviving rows keep
        their original index labels.

    Raises
    ------
    KeyError
        If ``ID``, ``EDUCATION`` or ``MARRIAGE`` is missing from ``data``.
    """
    # rename/drop both return new objects, so the caller's frame is never mutated.
    df = (
        data
        .rename(columns={"default payment next month": "default"})
        .drop(columns="ID")
    )

    valid_rows = (df["EDUCATION"] != 0) & (df["MARRIAGE"] != 0)
    # Explicit copy: assigning a column on a boolean-filtered slice is chained
    # assignment and triggers SettingWithCopyWarning on pandas < 3.
    df = df.loc[valid_rows].copy()

    # Vectorised equivalent of apply(lambda x: 4 if x > 4 else x).
    df["EDUCATION"] = df["EDUCATION"].mask(df["EDUCATION"] > 4, 4)

    return df

In [3]:
def data_split(data):
    """Separate the feature columns from the ``"default"`` target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains a ``"default"`` column.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is ``data`` without the
        ``"default"`` column and ``target`` is that column as a Series.
    """
    target_name = "default"
    features = data.drop(columns=target_name)
    target = data[target_name]
    return features, target

In [4]:
def make_pipeline(estimator, categorical_features=None):
    """Build a preprocessing + model pipeline.

    The categorical columns are one-hot encoded (categories unseen during
    fitting are ignored at transform time) and every other column is passed
    through unchanged before reaching ``estimator``.

    Parameters
    ----------
    estimator : object
        Final step of the pipeline, registered under the name ``"estimator"``
        (so grid-search keys use the ``estimator__`` prefix).
    categorical_features : sequence of str, optional
        Columns to one-hot encode. Defaults to
        ``["EDUCATION", "SEX", "MARRIAGE"]``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``"preprocessor"`` and ``"estimator"``.
    """
    # Only the names actually used are imported; the previous unused imports
    # (StandardScaler, RandomForestClassifier, balanced_accuracy_score) are gone.
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    if categorical_features is None:
        categorical_features = ["EDUCATION", "SEX", "MARRIAGE"]

    # Encode the categorical columns; the remaining columns are assumed to be
    # numeric and are forwarded as-is.
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), list(categorical_features)),
        ],
        remainder="passthrough",
    )

    return Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("estimator", estimator),
        ],
        verbose=False,
    )

In [5]:
def make_grid_search(estimator, param_grid, cv=10):
    """Create an unfitted grid search scored by balanced accuracy.

    Parameters
    ----------
    estimator : object
        Estimator (here, a pipeline) whose hyperparameters are searched.
    param_grid : dict
        Mapping of parameter names to the candidate values to try.
    cv : int, default 10
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Configured with ``n_jobs=-1`` and ``verbose=2``.
    """
    from sklearn.model_selection import GridSearchCV

    search_options = {
        "estimator": estimator,
        "param_grid": param_grid,
        "cv": cv,
        "scoring": "balanced_accuracy",
        "n_jobs": -1,
        "verbose": 2,
    }
    return GridSearchCV(**search_options)

In [6]:
def save_estimator_compressed(estimator, file_path="../files/models/model.pkl.gz"):
    """Pickle ``estimator`` into a gzip-compressed file.

    Parameters
    ----------
    estimator : object
        Any picklable object (in this notebook, the fitted grid search).
    file_path : str, default "../files/models/model.pkl.gz"
        Destination file. Missing parent directories are created.

    Returns
    -------
    None
    """
    import gzip
    import os
    import pickle

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with gzip.open(file_path, "wb") as file:
        pickle.dump(estimator, file)

In [7]:
def load_estimator_compressed(file_path="../files/models/model.pkl.gz"):
    """Load a gzip-compressed pickle, returning ``None`` on any failure.

    This is deliberately best-effort: a missing or unreadable file is
    reported on stdout and ``None`` is returned instead of raising.

    Parameters
    ----------
    file_path : str, default "../files/models/model.pkl.gz"
        Location of the compressed pickle.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` if the file is missing or loading
        fails for any reason.
    """
    import gzip
    import os
    import pickle

    loaded = None
    try:
        if os.path.exists(file_path):
            # SECURITY: unpickling executes arbitrary code; only load files
            # that this project produced itself.
            with gzip.open(file_path, "rb") as handle:
                loaded = pickle.load(handle)
        else:
            raise FileNotFoundError(f"El archivo {file_path} no se encuentra.")
    except Exception as err:
        # Any error (including the missing-file one raised above) is reported
        # and turned into a None result.
        print(f"Ocurrió un error al cargar el modelo: {err}")
        return None

    return loaded

In [8]:
def eval_metrics(y_true, y_pred):
    """Compute MSE, MAE and R2 between true and predicted values.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` in that order.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    # Evaluated in the same order as the returned tuple.
    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    mse, mae, r2 = (scorer(y_true, y_pred) for scorer in scorers)
    return mse, mae, r2

In [9]:
def report(estimator, mse, mae, r2):
    """Print an estimator followed by its MSE, MAE and R2 values.

    Parameters
    ----------
    estimator : object
        Printed via ``str()`` as the heading line, followed by a colon.
    mse, mae, r2 : object
        Metric values, printed one per line.

    Returns
    -------
    None
    """
    lines = (
        f"{estimator!s}:",
        f"  MSE: {mse}",
        f"  MAE: {mae}",
        f"   R2: {r2}",
    )
    for line in lines:
        print(line)

In [10]:
def check_estimator(
    data_path="../files/input/test_data.csv.zip",
    model_path="../files/models/model.pkl.gz",
):
    """Evaluate the saved model on a dataset and print its metrics.

    The data is loaded, cleaned and split with this notebook's own helpers
    (``load_data``, ``clean_data``, ``data_split``), the model is read with
    ``load_estimator_compressed`` and the scores come from ``eval_metrics``.

    Parameters
    ----------
    data_path : str, default "../files/input/test_data.csv.zip"
        Zip-compressed CSV to evaluate on.
    model_path : str, default "../files/models/model.pkl.gz"
        Gzip-compressed pickle of the fitted model.

    Raises
    ------
    FileNotFoundError
        If ``data_path`` does not exist (raised by ``load_data``).
    RuntimeError
        If the model could not be loaded from ``model_path``.
    """
    # load_data needs a path and returns a single DataFrame, so the target
    # has to be separated explicitly after cleaning.
    test_data = clean_data(load_data(data_path))
    x_test, y_test_true = data_split(test_data)

    # load_estimator_compressed prints the cause and returns None on failure.
    estimator = load_estimator_compressed(model_path)
    if estimator is None:
        raise RuntimeError(f"No se pudo cargar el modelo desde {model_path}.")

    mse, mae, r2 = eval_metrics(
        y_test_true,
        estimator.predict(x_test),
    )

    # A fitted grid search exposes best_estimator_; fall back to the object
    # itself so a plain estimator or pipeline can be reported as well.
    report(getattr(estimator, "best_estimator_", estimator), mse, mae, r2)

In [11]:
def calculate_and_save_metrics(model, x_train, x_test, y_train, y_test):
    """Score ``model`` on train and test data and write the results to disk.

    Precision, balanced accuracy, recall and F1 are computed for each
    dataset and written to ``../files/output/metrics.json``, one JSON object
    per line (train first, then test). The file is overwritten.

    Parameters
    ----------
    model : object
        Fitted object exposing ``predict``.
    x_train, x_test : array-like
        Feature sets to predict on.
    y_train, y_test : array-like
        True labels for the corresponding feature sets.

    Returns
    -------
    None
    """
    import json
    import os

    from sklearn.metrics import (
        balanced_accuracy_score,
        f1_score,
        precision_score,
        recall_score,
    )

    def _summarize(dataset_name, y_true, y_pred):
        # Key order is kept stable because it defines the JSON layout.
        return {
            'type': 'metrics',
            'dataset': dataset_name,
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1_score': f1_score(y_true, y_pred, zero_division=0),
        }

    train_predictions = model.predict(x_train)
    test_predictions = model.predict(x_test)

    records = [
        _summarize('train', y_train, train_predictions),
        _summarize('test', y_test, test_predictions),
    ]

    os.makedirs("../files/output", exist_ok=True)
    with open("../files/output/metrics.json", "w") as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in records)

In [12]:
def calculate_and_save_confusion_matrices(model, x_train, x_test, y_train, y_test):
    """Append train/test confusion matrices to the metrics file.

    Each matrix is written to ``../files/output/metrics.json`` as one JSON
    object per line (train first, then test). The file is opened in append
    mode, so existing lines are preserved.

    Parameters
    ----------
    model : object
        Fitted object exposing ``predict``.
    x_train, x_test : array-like
        Feature sets to predict on.
    y_train, y_test : array-like
        True labels; assumed to be encoded as 0/1, as the ``true_0`` /
        ``predicted_1`` output keys imply.

    Returns
    -------
    None
    """
    import json
    import os

    from sklearn.metrics import confusion_matrix

    output_path = "../files/output/metrics.json"

    def _as_record(dataset_name, y_true, y_pred):
        # labels=[0, 1] pins the matrix to 2x2. Without it, data containing a
        # single class yields a 1x1 matrix and cm[0, 1] raises IndexError.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        return {
            'type': 'cm_matrix',
            'dataset': dataset_name,
            'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
            'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])},
        }

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    matrices = [
        _as_record('train', y_train, y_train_pred),
        _as_record('test', y_test, y_test_pred),
    ]

    # Create the directory, as the metrics writer does, so this function also
    # works when it is called on its own.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "a") as f:
        for matrix in matrices:
            f.write(json.dumps(matrix) + '\n')

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
# 1. Load and clean the raw datasets (train first, then test).
data_train = clean_data(load_data("../files/input/train_data.csv.zip"))
data_test = clean_data(load_data("../files/input/test_data.csv.zip"))

# 2. Separate the features from the "default" target column.
x_train, y_train = data_split(data_train)
x_test, y_test = data_split(data_test)

# 3. Preprocessing + random-forest pipeline (seeded with random_state=42).
pipeline = make_pipeline(estimator=RandomForestClassifier(random_state=42))

In [14]:
# Display the pipeline (not fitted yet) to inspect its steps and parameters.
pipeline

0,1,2
,steps,"[('preprocessor', ...), ('estimator', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
# 4. Hyperparameter grid for the search. Keys carry the "estimator__" prefix
#    because the classifier is the pipeline step named "estimator".
param_grid = {
    "estimator__n_estimators": [100],
    "estimator__max_depth": [None, 10],  # the only entry with more than one candidate
    "estimator__min_samples_split": [2],
    "estimator__min_samples_leaf": [1],
    "estimator__max_features": ["sqrt"],  # "sqrt" is used instead of "auto"
    "estimator__class_weight": [None],
}

# Wrap the pipeline in a 10-fold grid search.
estimator = make_grid_search(estimator=pipeline, param_grid=param_grid, cv=10)

# Fit on the training data; the fitted search object is the cell's output.
estimator.fit(x_train, y_train)

Fitting 10 folds for each of 2 candidates, totalling 20 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'estimator__class_weight': [None], 'estimator__max_depth': [None, 10], 'estimator__max_features': ['sqrt'], 'estimator__min_samples_leaf': [1], ...}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
# Load the previously saved model, if any (None when it cannot be loaded).
best_estimator = load_estimator_compressed()

if best_estimator is not None:
    # Score the saved model and the freshly fitted one on the test data.
    # NOTE(review): choosing between models with test-set scores leaks test
    # information into model selection; consider a separate validation split.
    saved_balanced_accuracy = balanced_accuracy_score(
        y_test, best_estimator.predict(x_test)
    )
    current_balanced_accuracy = balanced_accuracy_score(
        y_test, estimator.predict(x_test)
    )

    # Keep the saved model only when it strictly beats the new one.
    if saved_balanced_accuracy > current_balanced_accuracy:
        estimator = best_estimator

# 5. Save whichever model was kept.
save_estimator_compressed(estimator)

Ocurrió un error al cargar el modelo: El archivo ../files/models/model.pkl.gz no se encuentra.


In [17]:
# 6. Compute the train/test metrics; this call overwrites metrics.json.
calculate_and_save_metrics(estimator, x_train, x_test, y_train, y_test)

# 7. Compute the confusion matrices; they are appended to the same file, so this must run after the metrics call.
calculate_and_save_confusion_matrices(estimator, x_train, x_test, y_train, y_test)

In [18]:
import pandas as pd
# Read the metrics file: it holds one JSON object per line, hence lines=True.
metrica = pd.read_json("../files/output/metrics.json", lines=True)
metrica

Unnamed: 0,type,dataset,precision,balanced_accuracy,recall,f1_score,true_0,true_1
0,metrics,train,0.999576,0.998669,0.99746,0.998517,,
1,metrics,test,0.649416,0.674402,0.408185,0.501289,,
2,cm_matrix,train,,,,,"{'predicted_0': 16226, 'predicted_1': 2}","{'predicted_0': 12, 'predicted_1': 4713}"
3,cm_matrix,test,,,,,"{'predicted_0': 6653, 'predicted_1': 420}","{'predicted_0': 1128, 'predicted_1': 778}"
