# Paso 1: Carga y limpieza
### Objetivos:
- Renombrar columna default.
- Quitar ID.
- Agrupar EDUCATION > 4 en Others.
- Quitar información faltante.

In [183]:
import pandas as pd # type: ignore
import numpy as np # type: ignore

# Leemos dataframe para entrenamiento y para test (t)

dataframe = pd.read_csv("../files/input/train_data.csv.zip",index_col=False, compression="zip")
dataframet = pd.read_csv("../files/input/test_data.csv.zip",index_col=False, compression="zip")

dataframet.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0
2,11,200000,2,3,2,34,0,0,2,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,15,250000,1,1,2,29,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,16,50000,2,3,3,23,1,2,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0


In [184]:
# Quitamos el ID y renombramos la columna default (target)

dataframe.pop('ID')
dataframet.pop('ID')
dataframe.rename(columns={'default payment next month':'default'})
dataframet.rename(columns={'default payment next month':'default'})

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,20000,1,3,2,35,-2,-2,-2,-2,-1,...,0,13007,13912,0,0,0,13007,1122,0,0
2,200000,2,3,2,34,0,0,2,0,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,250000,1,1,2,29,0,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,50000,2,3,3,23,1,2,0,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,20000,1,2,1,44,-2,-2,-2,-2,-2,...,2882,9235,1719,2890,2720,2890,9263,1824,1701,0
8996,360000,1,1,2,35,-1,-1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
8997,150000,1,1,2,35,-1,-1,-1,-1,-1,...,780,0,0,9054,0,783,0,0,0,0
8998,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1


In [185]:
# Vemos todos los niveles de EDUCATION que hay

dataframe['EDUCATION'].unique()

array([3, 2, 1, 5, 4, 6, 0])

In [186]:
# Agrupamos todos los mayores de 4 en 3 (Others)

def normalizeEdu(x):
    if x < 5:
        return x
    else:
        return 4

dataframe["EDUCATION"] = dataframe["EDUCATION"].apply(lambda x: normalizeEdu(x))
dataframet["EDUCATION"] = dataframet["EDUCATION"].apply(lambda x: normalizeEdu(x))

In [187]:
# Verificamos que no hay valores nulos

np.unique(dataframet.isnull().values)

array([False])

In [188]:
#categorical_features = dataframe.select_dtypes(include=["object","category"]).columns
#categorical_features

In [189]:
#dataframe["SEX"] = dataframe["SEX"].astype('category',copy=False)
#dataframe["MARRIAGE"] = dataframe["MARRIAGE"].astype('category', copy=False)
#dataframe["EDUCATION"] = dataframe["MARRIAGE"].astype('category', copy=False)
#dataframet["SEX"] = dataframet["SEX"].astype('category',copy=False)
#dataframet["MARRIAGE"] = dataframet["MARRIAGE"].astype('category', copy=False)
#dataframet["EDUCATION"] = dataframe["MARRIAGE"].astype('category', copy=False)
#dataframet.dtypes

# Paso 2
Dividir datasets en entrenamiento y prueba con X_train, y_train, X_test, y y_test.

In [190]:
# División de features y target

X_train = dataframe.iloc[:, :-1]
y_train = dataframe.iloc[:, -1:]
X_test = dataframet.iloc[:, :-1]
y_test = dataframet.iloc[:, -1:]
X_train.shape

(21000, 23)

# Paso 3
Crear el pipeline con las dos capas: OneHotEncoder y RandomForestClassifier.

In [191]:
from sklearn.ensemble import RandomForestClassifier # type: ignore
from sklearn.preprocessing import OneHotEncoder # type: ignore
from sklearn.compose import ColumnTransformer # type: ignore
from sklearn.pipeline import Pipeline # type: ignore
from sklearn.model_selection import cross_val_score # type: ignore

ohe = OneHotEncoder()
rf = RandomForestClassifier()

preprocessor = ColumnTransformer(transformers=[("encoder", ohe, ["SEX", "MARRIAGE", "EDUCATION"])],remainder="passthrough")

pipe = Pipeline([("preprocessor", preprocessor),
                ("random forest",rf)])

# Paso 4
Definición y optimización de hiperparámetros.

In [192]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'random forest__n_estimators': [150,250],
    'random forest__max_depth': [150, 200, 250],
    'random forest__min_samples_split': [3, 5],
    'random forest__min_samples_leaf': [1, 3],
    'random forest__max_features': ['auto','sqrt']
}



# Configurar GridSearchCV
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=10,  # 10 particiones para la validación cruzada
    scoring='balanced_accuracy',  # Usar precisión balanceada
    n_jobs=-1  # Usar todos los núcleos disponibles
)

# Entrenar el modelo
grid_search.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [193]:
from sklearn.metrics import balanced_accuracy_score

# Obtener el mejor modelo
best_model = grid_search   #.best_estimator_

# Evaluar en el conjunto de prueba
y_pred = best_model.predict(X_test)
test_score = balanced_accuracy_score(y_test, y_pred)
str(type(best_model))

"<class 'sklearn.model_selection._search.GridSearchCV'>"

# Paso 5
Guardar el modelo.

In [194]:
import os
import pickle

# Crear las carpetas necesarias si no existen
os.makedirs('../files/models', exist_ok=True)

# Ruta del archivo donde se guardará el modelo
model_path = '../files/models/model.pkl'

# Guardar el modelo usando pickle
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

os.path.getsize(model_path)

# 54736115
# 95376431
# 87332271
# 25177770

45808819

# Paso 6
Cálculo de métricas y guardado JSON.

In [195]:
from sklearn.metrics import (
    precision_score,
    balanced_accuracy_score,
    recall_score,
    f1_score
)
import json


# Hacer predicciones en los conjuntos de entrenamiento y prueba
# Predecir en el conjunto de entrenamiento
y_train_pred = best_model.predict(X_train)

# Predecir en el conjunto de prueba
y_test_pred = best_model.predict(X_test)

# Calcular las métricas para cada conjunto
def round_float(value):
    return round(value, 3)

metrics_train = {
    'type': 'metrics',
    'dataset': 'train',
    'precision': precision_score(y_train, y_train_pred, average='binary'),
    'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
    'recall': recall_score(y_train, y_train_pred, average='binary'),
    'f1_score': f1_score(y_train, y_train_pred, average='binary')
}

metrics_test = {
    'type': 'metrics',
    'dataset': 'test',
    'precision': precision_score(y_test, y_test_pred, average='binary'),
    'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
    'recall': recall_score(y_test, y_test_pred, average='binary'),
    'f1_score': f1_score(y_test, y_test_pred, average='binary')
}

# Guardar las métricas en 'files/output/metrics.json'
# Crear el directorio si no existe
os.makedirs('../files/output', exist_ok=True)

metrics_path = '../files/output/metrics.json'

# Escribir las métricas en el archivo
with open(metrics_path, 'w') as f:
    json.dump([metrics_train], f)
    f.write('\n')
    json.dump([metrics_test], f)
    f.write('\n')

# Paso 7
Matrices de confusión

In [196]:
from sklearn.metrics import confusion_matrix


# Calcular las matrices de confusión
def format_confusion_matrix(cm, dataset_type):
    cm_dict = {
        'type': 'cm_matrix',
        'dataset': dataset_type,
        'true_0': {
            'predicted_0': int(cm[0][0]) if cm[0][0] != 0 else None,
            'predicted_1': int(cm[0][1]) if cm[0][1] != 0 else None
        },
        'true_1': {
            'predicted_0': int(cm[1][0]) if cm[1][0] != 0 else None,
            'predicted_1': int(cm[1][1]) if cm[1][1] != 0 else None
        }
    }
    return cm_dict

cm_train = confusion_matrix(y_train, y_train_pred, labels=[0,1])
cm_test = confusion_matrix(y_test, y_test_pred, labels=[0,1])

cm_train_formatted = format_confusion_matrix(cm_train, 'train')
cm_test_formatted = format_confusion_matrix(cm_test, 'test')

# Crear la lista de métricas y matrices de confusión sin duplicados
metrics_data = [metrics_train, metrics_test, cm_train_formatted, cm_test_formatted]

# Guardar el contenido en 'metrics.json' sin caracteres de nueva línea
os.makedirs('../files/output', exist_ok=True)
metrics_path = '../files/output/metrics.json'

with open(metrics_path, 'w') as f:
    json.dump(metrics_train, f)
    f.write('\n')
    json.dump(metrics_test, f)
    f.write('\n')
    json.dump(cm_train_formatted, f, separators=(',', ':'), ensure_ascii=False)
    f.write("\n")
    json.dump(cm_test_formatted, f, separators=(',', ':'), ensure_ascii=False)