In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
import gzip
import pickle
import json
import os
import zipfile

In [5]:
# Cargar los datos
train_df = pd.read_csv('../files/input/train_data.csv/train_default_of_credit_card_clients.csv')
test_df = pd.read_csv('../files/input/test_data.csv/test_default_of_credit_card_clients.csv')

# Renombrar la columna "default payment next month" a "default"
train_df.rename(columns={'default payment next month': 'default'}, inplace=True)
test_df.rename(columns={'default payment next month': 'default'}, inplace=True)

# Remover la columna "ID"
train_df.drop(columns=['ID'], inplace=True)
test_df.drop(columns=['ID'], inplace=True)

# Eliminar registros con información no disponible
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# Agrupar valores de EDUCATION > 4 en la categoría "others"
train_df['EDUCATION'] = train_df['EDUCATION'].apply(lambda x: 4 if x > 4 else x)
test_df['EDUCATION'] = test_df['EDUCATION'].apply(lambda x: 4 if x > 4 else x)

In [6]:
# Dividir los datos en características (X) y etiquetas (y)
x_train = train_df.drop(columns=['default'])
y_train = train_df['default']
x_test = test_df.drop(columns=['default'])
y_test = test_df['default']

In [7]:
# Crear un pipeline para el modelo de clasificación
# Definir las columnas categóricas
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

# Crear un transformador para las variables categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ], remainder='passthrough')

# Crear el pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [66]:
# Definir los hiperparámetros a optimizar
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

# Realizar la búsqueda de hiperparámetros
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='balanced_accuracy')
grid_search.fit(x_train, y_train)

# Mejor modelo encontrado
best_model = grid_search



In [67]:
print(best_model)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('cat',
                                                                         OneHotEncoder(),
                                                                         ['SEX',
                                                                          'EDUCATION',
                                                                          'MARRIAGE'])])),
                                       ('classifier',
                                        RandomForestClassifier(random_state=42))]),
             param_grid={'classifier__max_depth': [None, 10, 20],
                         'classifier__min_samples_leaf': [1, 2],
                         'classifier__min_samples_split': [2, 5],
                         'classifier__n_estimators': [100, 200]},
           

In [68]:
# Guardar el modelo comprimido
with gzip.open('../files/models/model.pkl.gz', 'wb') as f:
    pickle.dump(best_model, f)

In [80]:

# Predecir en los conjuntos de entrenamiento y prueba
y_train_pred = best_model.predict(x_train)
y_test_pred = best_model.predict(x_test)

# Calcular las métricas
train_metrics = {
    'type': 'metrics',
    'dataset': 'train',
    'precision': precision_score(y_train, y_train_pred),
    'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
    'recall': recall_score(y_train, y_train_pred),
    'f1_score': f1_score(y_train, y_train_pred)
}

test_metrics = {
    'type': 'metrics',
    'dataset': 'test',
    'precision': precision_score(y_test, y_test_pred),
    'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
    'recall': recall_score(y_test, y_test_pred),
    'f1_score': f1_score(y_test, y_test_pred)
}
# Calcular las matrices de confusión
train_cm = confusion_matrix(y_train, y_train_pred)
test_cm = confusion_matrix(y_test, y_test_pred)

# Convertir las matrices de confusión a diccionarios y asegurarse de que los valores sean int
train_cm_dict = {
    'type': 'cm_matrix',
    'dataset': 'train',
    'true_0': {'predicted_0': int(train_cm[0, 0]), 'predicted_1': int(train_cm[0, 1])},
    'true_1': {'predicted_0': int(train_cm[1, 0]), 'predicted_1': int(train_cm[1, 1])}
}

test_cm_dict = {
    'type': 'cm_matrix',
    'dataset': 'test',
    'true_0': {'predicted_0': int(test_cm[0, 0]), 'predicted_1': int(test_cm[0, 1])},
    'true_1': {'predicted_0': int(test_cm[1, 0]), 'predicted_1': int(test_cm[1, 1])}
}

with open("../files/output/metrics.json", "w") as f:
    json.dump([train_metrics, test_metrics, train_cm_dict, test_cm_dict], f)



In [72]:
# Ruta al archivo JSON
output_file = '../files/output/metrics.json'

# Crear la carpeta si no existe
output_dir = os.path.dirname(output_file)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Leer el contenido actual del archivo JSON (si existe)
if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
    with open(output_file, 'r') as f:
        try:
            data = json.load(f)  # Cargar el contenido del archivo como una lista
        except json.JSONDecodeError:
            # Si el archivo está corrupto o vacío, inicializar una lista vacía
            data = []
else:
    data = []

# Agregar los nuevos datos a la lista
data.append(train_cm_dict)
data.append(test_cm_dict)

# Escribir el archivo JSON completo
with open(output_file, 'w') as f:
    json.dump(data, f, indent=4)

In [73]:
def _load_metrics():
    with open("../files/output/metrics.json", "r", encoding="utf-8") as file:
        data = file.read().strip()  # Elimina espacios en blanco y saltos de línea al inicio y final
        return json.loads(data)

data = _load_metrics()
data

[{'type': 'metrics',
  'dataset': 'train',
  'precision': 0.9931614939505523,
  'balanced_accuracy': 0.8986087888414297,
  'recall': 0.7988153162682462,
  'f1_score': 0.8854496423965295},
 {'type': 'metrics',
  'dataset': 'test',
  'precision': 0.6681146828844483,
  'balanced_accuracy': 0.6744788009561253,
  'recall': 0.4028287061288633,
  'f1_score': 0.5026143790849673},
 {'type': 'cm_matrix',
  'dataset': 'train',
  'true_0': {'predicted_0': 16247, 'predicted_1': 26},
  'true_1': {'predicted_0': 951, 'predicted_1': 3776}},
 {'type': 'cm_matrix',
  'dataset': 'test',
  'true_0': {'predicted_0': 6709, 'predicted_1': 382},
  'true_1': {'predicted_0': 1140, 'predicted_1': 769}}]

In [74]:
with open('../files/output/metrics.json', 'w') as f:
    json.dump(data, f)