In [1]:
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

Limpieza de datos

In [2]:
# Load the test split and clean it in a single non-mutating chain.
# Building the frame in one expression (no inplace=True, no column assignment
# on a filtered slice) avoids pandas' SettingWithCopyWarning and makes the cell
# idempotent: re-running it always starts again from the raw CSV.
df_test = (
    pd.read_csv(
        "../files/input/test_data.csv.zip",
        index_col=False,
        compression="zip",
    )
    .rename(columns={'default payment next month': 'default'})
    .drop(columns=['ID'])
    .dropna()
    # Rows coded 0 in MARRIAGE or EDUCATION are discarded
    # (presumably "not available" codes -- TODO confirm against the data dictionary).
    .loc[lambda frame: (frame["MARRIAGE"] != 0) & (frame["EDUCATION"] != 0)]
    # Collapse EDUCATION codes above 4 into the label 'others'.
    # NOTE(review): np.where mixes a str with the numeric column, so every
    # EDUCATION value comes out as a string and 'others' is a category
    # separate from code 4 -- confirm that this is intended.
    .assign(
        EDUCATION=lambda frame: np.where(
            frame['EDUCATION'] > 4, 'others', frame['EDUCATION']
        )
    )
)
df_test

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,20000,1,3,2,35,-2,-2,-2,-2,-1,...,0,13007,13912,0,0,0,13007,1122,0,0
2,200000,2,3,2,34,0,0,2,0,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,250000,1,1,2,29,0,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,50000,2,3,3,23,1,2,0,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,20000,1,2,1,44,-2,-2,-2,-2,-2,...,2882,9235,1719,2890,2720,2890,9263,1824,1701,0
8996,360000,1,1,2,35,-1,-1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
8997,150000,1,1,2,35,-1,-1,-1,-1,-1,...,780,0,0,9054,0,783,0,0,0,0
8998,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1


In [3]:

# Load the training split and clean it in a single non-mutating chain.
# Building the frame in one expression (no inplace=True, no column assignment
# on a filtered slice) avoids pandas' SettingWithCopyWarning and makes the cell
# idempotent: re-running it always starts again from the raw CSV.
df_train = (
    pd.read_csv(
        "../files/input/train_data.csv.zip",
        index_col=False,
        compression="zip",
    )
    .rename(columns={'default payment next month': 'default'})
    .drop(columns=['ID'])
    .dropna()
    # Rows coded 0 in MARRIAGE or EDUCATION are discarded
    # (presumably "not available" codes -- TODO confirm against the data dictionary).
    .loc[lambda frame: (frame["MARRIAGE"] != 0) & (frame["EDUCATION"] != 0)]
    # Collapse EDUCATION codes above 4 into the label 'others'.
    # NOTE(review): np.where mixes a str with the numeric column, so every
    # EDUCATION value comes out as a string and 'others' is a category
    # separate from code 4 -- confirm that this is intended.
    .assign(
        EDUCATION=lambda frame: np.where(
            frame['EDUCATION'] > 4, 'others', frame['EDUCATION']
        )
    )
)
df_train


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,140000,2,2,1,27,2,-1,-1,-1,0,...,1580,804,728,752,800,1580,0,700,700,0
20996,130000,1,2,2,41,0,0,0,0,0,...,123107,42897,39378,4442,5200,5012,2500,5000,2000,0
20997,50000,1,3,2,23,0,0,0,0,0,...,28967,29829,30046,1973,1426,1001,1432,1062,997,0
20998,90000,2,3,2,25,0,0,0,0,0,...,5613,10113,10113,3000,3000,0,4500,0,3440,0


Dividir datos

In [4]:
# Split each cleaned frame into its feature matrix (every column except
# 'default') and its target vector (the 'default' column).
X_train, y_train = df_train.drop('default', axis=1), df_train['default']
X_test, y_test = df_test.drop('default', axis=1), df_test['default']

# Sanity check: report the shapes of both splits.
print(f"x_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"x_test: {X_test.shape}, y_test: {y_test.shape}")


x_train: (20953, 23), y_train: (20953,)
x_test: (8979, 23), y_test: (8979,)


pipeline

In [5]:

# Columns that hold category codes; these are one-hot encoded below.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

# One-hot encoder configured with handle_unknown='ignore' so that categories
# not seen while fitting do not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Encode only the categorical columns; every remaining column is passed
# through untouched.
preprocessor = ColumnTransformer(
    [('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)

# Baseline random forest: 100 trees with a fixed seed for repeatable runs.
random_forest = RandomForestClassifier(random_state=42, n_estimators=100)

# Chain preprocessing and the classifier into a single estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', random_forest),
])

# Fit the baseline on the training split.
pipeline.fit(X_train, y_train)

# Score the baseline on the test split (reported below as accuracy).
score = pipeline.score(X_test, y_test)
print(f"Accuracy del modelo en el conjunto de prueba: {score:.2f}")


Accuracy del modelo en el conjunto de prueba: 0.82


Optimización de hiperparámetros

In [6]:

# Hyperparameter grid for the forest; the 'classifier__' prefix targets the
# pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],    # number of trees
    classifier__max_depth=[None, 10, 20, 30],   # maximum tree depth
    classifier__min_samples_leaf=[1, 2, 5],     # minimum samples per leaf
)

# Exhaustive grid search over the pipeline, using 10 cross-validation splits
# and ranking candidates by balanced accuracy.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='balanced_accuracy',
    cv=10,
    n_jobs=-1,   # run the fits in parallel
    verbose=1,   # print progress
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print(f"Mejores hiperparámetros: {grid_search.best_params_}")
print(f"Mejor precisión balanceada: {grid_search.best_score_:.4f}")

# Evaluate the refitted best pipeline on the test split.
# NOTE(review): .score() here is presumably plain accuracy (as in the baseline
# cell), not the balanced accuracy used for the search, so this figure is not
# directly comparable with best_score_ -- confirm which metric is intended.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Precisión en el conjunto de prueba: {test_score:.4f}")


Fitting 10 folds for each of 36 candidates, totalling 360 fits
Mejores hiperparámetros: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__n_estimators': 200}
Mejor precisión balanceada: 0.6583
Precisión en el conjunto de prueba: 0.8293


Guardar el modelo (pkl)

In [7]:
# Persist the tuned pipeline so it can be reloaded without repeating the search.
model_path = '../files/models/model.pkl'

# Create the destination folder first: open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as file:
    pickle.dump(grid_search.best_estimator_, file)

print(f"Modelo guardado en {model_path}")


Modelo guardado en ../files/models/model.pkl


In [8]:
# Round-trip check: read the pickled pipeline back from model_path.
# NOTE: pickle.load can execute arbitrary code, so only use it on files that
# this notebook itself produced.
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Confirm the reloaded pipeline still scores on the test split.
loaded_model_score = loaded_model.score(X_test, y_test)
print(f"Precisión en conjunto de prueba (modelo cargado): {loaded_model_score:.4f}")


Precisión en conjunto de prueba (modelo cargado): 0.8293


In [9]:
def _metrics_record(dataset, y_true, y_pred):
    """Build the metrics entry for one split.

    Args:
        dataset: Label stored under the 'dataset' key ('train' or 'test').
        y_true: True target values for the split.
        y_pred: Predicted target values for the split.

    Returns:
        dict with precision, balanced accuracy, recall and F1 for the split,
        in the key order that is written to the JSON file.
    """
    return {
        'type': 'metrics',
        'dataset': dataset,
        'precision': precision_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred)
    }


def _confusion_record(dataset, cm):
    """Build the confusion-matrix entry for one split.

    Args:
        dataset: Label stored under the 'dataset' key ('train' or 'test').
        cm: 2x2 confusion matrix indexed as cm[true, predicted].

    Returns:
        dict with the four counts cast to built-in int so that json.dump can
        serialize them.
    """
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }


# Create the output folder if it does not exist yet.
output_dir = '../files/output/'
os.makedirs(output_dir, exist_ok=True)

# Predict on both splits with the tuned pipeline.
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Scalar metrics for each split.
metrics_train = _metrics_record('train', y_train, y_train_pred)
metrics_test = _metrics_record('test', y_test, y_test_pred)

# Confusion matrices for each split.
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

# Confusion matrices reshaped into the nested-dict layout used in the report.
cm_train_dict = _confusion_record('train', cm_train)
cm_test_dict = _confusion_record('test', cm_test)

# Write all four entries to a single JSON file.
metrics = [metrics_train, metrics_test, cm_train_dict, cm_test_dict]
output_path = os.path.join(output_dir, 'metrics.json')

with open(output_path, 'w') as f:
    json.dump(metrics, f, indent=4)

print(f"Métricas guardadas en {output_path}")


Métricas guardadas en ../files/output/metrics.json
