In [1]:
import pandas as pd
import numpy as np
import gzip
import os
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV
import zipfile
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
    confusion_matrix,
    accuracy_score
)
import json

In [2]:
ruta_test = "../files/input/test_data.csv.zip"
ruta_train = "../files/input/train_data.csv.zip"

with zipfile.ZipFile(ruta_train) as z:
    csv_name = [name for name in z.namelist() if name.endswith(".csv")][0]
    with z.open(csv_name) as f:
        df_train = pd.read_csv(f)


df_train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [3]:
with zipfile.ZipFile(ruta_test) as z:
    csv_name = [name for name in z.namelist() if name.endswith(".csv")][0]
    with z.open(csv_name) as f:
        df_test = pd.read_csv(f)


df_test.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,10,20000,1,3,2,35,-2,-2,-2,-2,...,0,13007,13912,0,0,0,13007,1122,0,0
2,11,200000,2,3,2,34,0,0,2,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,15,250000,1,1,2,29,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,16,50000,2,3,3,23,1,2,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0


In [4]:
df_train.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [5]:
df_train.SEX.value_counts()

SEX
2    12697
1     8303
Name: count, dtype: int64

In [6]:
df_train.EDUCATION.value_counts()

EDUCATION
2    9759
1    7480
3    3427
5     187
4      98
6      40
0       9
Name: count, dtype: int64

In [7]:
df_train.MARRIAGE.value_counts()

MARRIAGE
2    11232
1     9505
3      225
0       38
Name: count, dtype: int64

In [8]:
# Renombramos la columna default payment
df_train = df_train.rename(columns={'default payment next month': 'default'})
df_test = df_test.rename(columns={'default payment next month': 'default'})

# Eliminar la columna ID de ambos dataframe
df_train.drop('ID', axis=1, inplace=True)
df_test.drop('ID', axis=1, inplace=True)




# Reemplazar valores en la columna EDUCATION
df_train['EDUCATION'] = df_train['EDUCATION'].replace({0: np.nan})
df_train.loc[df_train['EDUCATION'] > 4, 'EDUCATION'] = 4

df_test['EDUCATION'] = df_test['EDUCATION'].replace({0: np.nan})
df_test.loc[df_test['EDUCATION'] > 4, 'EDUCATION'] = 4

df_train['MARRIAGE'] = df_train['MARRIAGE'].replace({0: np.nan})
df_test['MARRIAGE'] = df_test['MARRIAGE'].replace({0: np.nan})


# Eliminar los NaN
df_train = df_train.dropna()
df_test = df_test.dropna()

df_train['EDUCATION'] = df_train['EDUCATION'].astype(int)
df_test['EDUCATION'] = df_test['EDUCATION'].astype(int)

df_train['MARRIAGE'] = df_train['MARRIAGE'].astype(int)
df_test['MARRIAGE'] = df_test['MARRIAGE'].astype(int)

In [9]:
df_train.SEX.value_counts()

SEX
2    12664
1     8289
Name: count, dtype: int64

In [10]:
df_train.EDUCATION.value_counts()

EDUCATION
2    9756
1    7476
3    3396
4     325
Name: count, dtype: int64

In [11]:
df_train.MARRIAGE.value_counts()

MARRIAGE
2    11226
1     9502
3      225
Name: count, dtype: int64

In [12]:
df_train.default.value_counts()

default
0    16228
1     4725
Name: count, dtype: int64

In [13]:
# Dividir los datasets
y_test = df_test['default']
x_test = df_test.drop('default', axis = 1)

y_train= df_train['default']
x_train = df_train.drop('default', axis = 1)

cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE']

column_transformer = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), [])  
], remainder='passthrough')

In [14]:

pipeline_rf = Pipeline([
    ('codificacion', column_transformer),
    ('modelo', RandomForestClassifier())
])



In [31]:
param_grid = {
    'modelo__n_estimators': [250],
    'modelo__max_depth': [24],
    'modelo__max_features': ['sqrt'],
    'modelo__min_samples_leaf': [2],
    'modelo__min_samples_split':[2]
    
}
grid_search = GridSearchCV(pipeline_rf, param_grid, cv=10, scoring='balanced_accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor balanced accuracy (validación):", grid_search.best_score_)

Mejores hiperparámetros: {'modelo__max_depth': 24, 'modelo__max_features': 'sqrt', 'modelo__min_samples_leaf': 2, 'modelo__min_samples_split': 2, 'modelo__n_estimators': 250}
Mejor balanced accuracy (validación): 0.6592027078613903


In [34]:
os.makedirs("../files/models", exist_ok=True)

# Guardar el modelo entrenado
with gzip.open("../files/models/model.pkl.gz", "wb") as f:
    pickle.dump(grid_search, f)

In [33]:

os.makedirs("../files/output", exist_ok=True)

metrics_output = []

# Predicciones
y_train_pred = grid_search.predict(x_train)
y_test_pred = grid_search.predict(x_test)

# Función auxiliar para métricas
def calcular_metricas(y_true, y_pred, dataset):
    return {
        'type': 'metrics',  
        'dataset': dataset,
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1_score': f1_score(y_true, y_pred, zero_division=0)
    }

# Función auxiliar para matriz de confusión
def calcular_cm(y_true, y_pred, dataset):
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }

# Calcular y almacenar métricas y CM
metrics_output.append(calcular_metricas(y_train, y_train_pred, 'train'))
metrics_output.append(calcular_metricas(y_test, y_test_pred, 'test'))

metrics_output.append(calcular_cm(y_train, y_train_pred, 'train'))
metrics_output.append(calcular_cm(y_test, y_test_pred, 'test'))

# Guardar en JSON (una línea por objeto)
with open("../files/output/metrics.json", "w") as f:
    for item in metrics_output:
        json.dump(item, f)
        f.write("\n")

In [None]:
# Ruta al archivo .pkl
ruta = "../files/grading/x_train.pkl"

# Cargar el DataFrame
df = pd.read_pickle(ruta)

# Verificar que se cargó correctamente
df.SEX.value_counts()

SEX
2    12664
1     8289
Name: count, dtype: int64

In [None]:
df.EDUCATION.value_counts()

EDUCATION
2    9756
1    7476
3    3396
4     325
Name: count, dtype: int64

In [None]:
df.MARRIAGE.value_counts()

MARRIAGE
2    11226
1     9502
3      225
Name: count, dtype: int64

In [None]:
df

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,310000,1,3,1,32,0,0,0,0,0,...,110375,84373,57779,14163,8295,6000,4000,3000,1000,2000
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,-748,1690,1138,930,0,0,2828,0,182,0
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,46257,45975,1300,43987,0,46257,2200,1300,43987,1386
3,80000,2,3,1,52,2,2,3,3,3,...,40101,40748,39816,40607,3700,1600,1600,0,1600,1600
4,270000,1,1,2,34,1,2,0,0,2,...,20924,22448,15490,17343,0,4000,2000,0,2000,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,140000,2,2,1,27,2,-1,-1,-1,0,...,776,1580,804,728,752,800,1580,0,700,700
20996,130000,1,2,2,41,0,0,0,0,0,...,123746,123107,42897,39378,4442,5200,5012,2500,5000,2000
20997,50000,1,3,2,23,0,0,0,0,0,...,36023,28967,29829,30046,1973,1426,1001,1432,1062,997
20998,90000,2,3,2,25,0,0,0,0,0,...,5613,5613,10113,10113,3000,3000,0,4500,0,3440


In [None]:
df_train.to_csv("train")