In [24]:
import os
import zipfile

import pandas as pd

# importar one-hot-encoding
from sklearn.preprocessing import OneHotEncoder

# importar column transformer
from sklearn.compose import ColumnTransformer

# importar modelo de bosques aleatorios
from sklearn.ensemble import RandomForestClassifier

# importar pipeline
from sklearn.pipeline import Pipeline

# importar optimizador de hiperparametros
from sklearn.model_selection import GridSearchCV

# importar metricas
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score

# importar matriz de confusion
from sklearn.metrics import confusion_matrix

# importar libreria para guardar el modelo
import gzip
import pickle

# importar libreria para guardar las metricas
import json

In [25]:
# Read the train and test splits from their CSV files, then preview the
# first rows of the training frame.
train_csv = 'files/input/train_data.csv/train_default_of_credit_card_clients.csv'
test_csv = 'files/input/test_data.csv/test_default_of_credit_card_clients.csv'
train, test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))
train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [26]:
 # Renombrar la columna "default payment next month" a "default"
def _clean_credit_frame(raw):
    """Return a cleaned, independent copy of a credit-card default frame.

    The steps are applied identically to the train and test splits:

    1. Rename "default payment next month" to "default".
    2. Drop the "ID" column.
    3. Remove rows whose EDUCATION or MARRIAGE is 0.
    4. Remove rows that contain missing values.
    5. Collapse EDUCATION values above 4 into 4 (the "others" category).

    Args:
        raw: DataFrame with at least the EDUCATION and MARRIAGE columns.

    Returns:
        A new DataFrame; ``raw`` itself is not modified.
    """
    renamed = (
        raw
        .rename(columns={'default payment next month': 'default'})
        # errors='ignore' keeps the cell re-runnable after "ID" is already gone.
        .drop(columns='ID', errors='ignore')
    )
    has_known_codes = (renamed['EDUCATION'] != 0) & (renamed['MARRIAGE'] != 0)
    # The explicit copy detaches the result from the filtered slice, so the
    # .loc assignment below cannot hit a SettingWithCopy situation.
    cleaned = renamed[has_known_codes].dropna().copy()
    cleaned.loc[cleaned['EDUCATION'] > 4, 'EDUCATION'] = 4
    return cleaned


train = _clean_credit_frame(train)
test = _clean_credit_frame(test)

In [27]:
# Display the current contents of the test frame.
test

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,20000,1,3,2,35,-2,-2,-2,-2,-1,...,0,13007,13912,0,0,0,13007,1122,0,0
2,200000,2,3,2,34,0,0,2,0,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,250000,1,1,2,29,0,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,50000,2,3,3,23,1,2,0,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,20000,1,2,1,44,-2,-2,-2,-2,-2,...,2882,9235,1719,2890,2720,2890,9263,1824,1701,0
8996,360000,1,1,2,35,-1,-1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
8997,150000,1,1,2,35,-1,-1,-1,-1,-1,...,780,0,0,9054,0,783,0,0,0,0
8998,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1


In [28]:
# Split each frame into the feature matrix and the "default" target column.
x_train = train.drop(columns='default')
y_train = train['default']
x_test = test.drop(columns='default')
y_test = test['default']

# Columns to one-hot encode. SEX, EDUCATION and MARRIAGE are stored as integer
# codes, so dtype-based detection alone skips them and leaves the encoder with
# nothing to do. They are therefore listed explicitly (when present) in
# addition to any object/category columns.
integer_coded_categoricals = ['SEX', 'EDUCATION', 'MARRIAGE']
categorical_features = x_train.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features += [
    column
    for column in integer_coded_categoricals
    if column in x_train.columns and column not in categorical_features
]

In [29]:
# Build the classification pipeline: one-hot encode the categorical columns,
# pass every remaining column through unchanged, then fit a random forest.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
encoding_steps = [('cat', categorical_transformer, categorical_features)]
preprocessor = ColumnTransformer(transformers=encoding_steps, remainder='passthrough')

forest = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', forest)])

In [30]:
def optimizar_hiperparametros(pipeline, x_train, y_train):
    """Configure the grid search used to tune the classification pipeline.

    The search is only built here, not fitted. ``x_train`` and ``y_train``
    are accepted but not used by this function.

    Args:
        pipeline: Estimator whose final step is named ``classifier``.
        x_train: Training features (unused).
        y_train: Training target (unused).

    Returns:
        An unfitted ``GridSearchCV`` using 10-fold cross-validation, scored
        by balanced accuracy, run on all cores and refit on the best
        parameter combination.
    """
    search_space = dict(
        classifier__n_estimators=[200],
        classifier__max_depth=[None],
        classifier__min_samples_split=[10],
        classifier__min_samples_leaf=[1, 2],
        classifier__max_features=['sqrt'],
    )
    return GridSearchCV(
        estimator=pipeline,
        param_grid=search_space,
        cv=10,
        scoring='balanced_accuracy',
        n_jobs=-1,
        refit=True,
    )

# Build the grid search and fit it on the training split.
grid = optimizar_hiperparametros(pipeline, x_train, y_train)
grid.fit(X=x_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as a different type depending on the ColumnTransformer inputs.



In [None]:
# Save the fitted grid search as a gzip-compressed pickle.
# The directory is created relative to the working directory, matching the
# path that is written to; the previous check used the absolute path
# '/files/models', so the real target directory was never created.
model_dir = 'files/models'
os.makedirs(model_dir, exist_ok=True)

with gzip.open(os.path.join(model_dir, 'model.pkl.gz'), 'wb') as f:
    pickle.dump(grid, f)

In [44]:
# Predict on the training and test splits; the metric and confusion-matrix
# records computed below are based on these predictions.
y_train_pred, y_test_pred = (grid.predict(features) for features in (x_train, x_test))

def metrics_calc(y_true, y_pred, dataset):
    """Build the metrics record for one dataset split.

    Args:
        y_true: True target values.
        y_pred: Predicted target values.
        dataset: Label stored under the ``dataset`` key (e.g. 'train').

    Returns:
        dict with ``type`` set to 'metrics', the ``dataset`` label, and the
        precision, balanced accuracy, recall and F1 scores of the predictions.
    """
    scorers = (
        ('precision', precision_score),
        ('balanced_accuracy', balanced_accuracy_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    record = {'type': 'metrics', 'dataset': dataset}
    for metric_name, scorer in scorers:
        record[metric_name] = scorer(y_true, y_pred)
    return record

def matrix_calc(y_true, y_pred, dataset):
    """Build the confusion-matrix record for one dataset split.

    Args:
        y_true: True target values (classes 0 and 1).
        y_pred: Predicted target values (classes 0 and 1).
        dataset: Label stored under the ``dataset`` key (e.g. 'train').

    Returns:
        dict with ``type`` set to 'cm_matrix', the ``dataset`` label, and the
        counts for each true class split by predicted class, as plain ints.
    """
    # Pin the label order so the matrix is always 2x2; without it a split in
    # which only one class appears yields a 1x1 matrix and the indexing below
    # raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }
# Collect the metric and confusion-matrix records for both splits.
metrics = [
    metrics_calc(y_train, y_train_pred, 'train'),
    metrics_calc(y_test, y_test_pred, 'test'),
    matrix_calc(y_train, y_train_pred, 'train'),
    matrix_calc(y_test, y_test_pred, 'test')
]

# Create the output directory first so the write does not fail on a fresh
# checkout, then store one JSON object per line.
os.makedirs("files/output", exist_ok=True)
with open("files/output/metrics.json", "w", encoding="utf-8") as f:
    for metric in metrics:
        f.write(json.dumps(metric) + "\n")