In [1]:
import zipfile
import os
import pandas as pd
import numpy as np
import pandas as pd
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score
import gzip
import pickle
import json
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score



In [2]:
# cargar los datos
train_path = "../files/input/train_data.csv.zip"
test_path = "../files/input/test_data.csv.zip"
extract_path = "../files/input/"


In [3]:
if not os.path.exists("../files/input/train_default_of_credit_card_clients.csv"):
  with zipfile.ZipFile(train_path, 'r') as zip_ref:
      zip_ref.extractall(extract_path)

if not os.path.exists("../files/input/test_default_of_credit_card_clients.csv"):
  with zipfile.ZipFile(test_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [4]:
train = pd.read_csv("../files/input/train_default_of_credit_card_clients.csv")
test = pd.read_csv("../files/input/test_default_of_credit_card_clients.csv")

## Paso 1: limpieza de los datasets

In [5]:
#Renombre la columna "default payment next month" a "default"
train = train.rename(columns={"default payment next month": "default"})
test  = test.rename(columns={"default payment next month": "default"})

In [6]:
#Remueva la columna "ID"
train = train.drop(columns=["ID"])
test  = test.drop(columns=["ID"])

In [7]:
#Para la columna EDUCATION, valores > 4 indican niveles superiores
# de educación, agrupe estos valores en la categoría "others"
def limpiar_df(df):
    # Convertir EDUCATION a numérico
    df["EDUCATION"] = pd.to_numeric(df["EDUCATION"], errors="coerce").fillna(4)

    # Reemplazar valores inválidos por 4
    df["EDUCATION"] = df["EDUCATION"].astype(int).clip(1, 4)
    df.loc[df["EDUCATION"] == 0] = 4

    # Eliminar registros con MARRIAGE == 0
    df = df[df["MARRIAGE"] != 0]

    return df

train = limpiar_df(train)
test = limpiar_df(test)


In [8]:
#Elimine los registros con informacion no disponible
train = train.dropna()
test  = test.dropna()

In [9]:
#Analisis exploratorio de datos
train.info()
train.describe()

test.info()
test.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 20962 entries, 0 to 20999
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   LIMIT_BAL  20962 non-null  int64
 1   SEX        20962 non-null  int64
 2   EDUCATION  20962 non-null  int64
 3   MARRIAGE   20962 non-null  int64
 4   AGE        20962 non-null  int64
 5   PAY_0      20962 non-null  int64
 6   PAY_2      20962 non-null  int64
 7   PAY_3      20962 non-null  int64
 8   PAY_4      20962 non-null  int64
 9   PAY_5      20962 non-null  int64
 10  PAY_6      20962 non-null  int64
 11  BILL_AMT1  20962 non-null  int64
 12  BILL_AMT2  20962 non-null  int64
 13  BILL_AMT3  20962 non-null  int64
 14  BILL_AMT4  20962 non-null  int64
 15  BILL_AMT5  20962 non-null  int64
 16  BILL_AMT6  20962 non-null  int64
 17  PAY_AMT1   20962 non-null  int64
 18  PAY_AMT2   20962 non-null  int64
 19  PAY_AMT3   20962 non-null  int64
 20  PAY_AMT4   20962 non-null  int64
 21  PAY_AMT5   20962 

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
count,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0,...,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0,8984.0
mean,168166.482636,1.601514,1.84695,1.548531,35.46683,-0.029831,-0.128228,-0.162066,-0.217498,-0.263357,...,42705.577248,39524.768811,37913.817676,5495.134016,6108.07,5264.100401,4658.682769,4637.101736,5115.508014,0.212155
std,130090.165042,0.489614,0.738001,0.519126,9.137198,1.130867,1.200592,1.212299,1.190009,1.151494,...,63285.362148,59772.497749,58221.426363,15051.544479,22971.57,16479.038101,14950.070058,14908.794881,16514.228476,0.408857
min,10000.0,1.0,1.0,1.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-61372.0,-57060.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,2188.5,1773.75,1268.5,931.75,824.0,409.25,315.5,266.0,200.0,0.0
50%,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,19033.0,17913.5,16721.0,2100.0,2000.0,1777.0,1500.0,1500.0,1500.0,0.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,...,54762.25,50056.25,48652.25,5026.0,5000.0,4451.5,4000.0,4000.0,4026.5,0.0
max,780000.0,2.0,4.0,3.0,75.0,8.0,7.0,7.0,8.0,7.0,...,572805.0,823540.0,527711.0,493358.0,1227082.0,417588.0,497000.0,417990.0,351282.0,1.0


## Paso 2: Divida los datasets 

In [10]:
# Divida los datasets en x_train, y_train, x_test, y_test

X_train = train.drop(columns=["default"])  
y_train = train["default"]                  


X_test = test.drop(columns=["default"])
y_test = test["default"]

## Paso 3: Crear un pipeline para el modelo de clasificación.

In [11]:
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Ajusta un modelo de bosques aleatorios (rando forest).

categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="passthrough"   
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42, n_jobs=-1))
])

## Paso 4: Optimizar los hiperparametros del pipeline.

In [12]:
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.

param_grid = {
    "classifier__n_estimators": [100],
    "classifier__max_depth": [None],
    "classifier__min_samples_split": [10],
    'classifier__min_samples_leaf': [4], 
    "classifier__max_features": [25],
}


grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring="balanced_accuracy",
    n_jobs=-1,
    refit=True,    
    verbose=2       
)

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__max_depth': [None], 'classifier__max_features': [25], 'classifier__min_samples_leaf': [4], 'classifier__min_samples_split': [10], ...}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,10
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,25
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
print("Mejores hiperparámetros:", grid_search.best_params_)
print("Best balanced accuracy:", grid_search.best_score_)

Mejores hiperparámetros: {'classifier__max_depth': None, 'classifier__max_features': 25, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Best balanced accuracy: 0.6576023163035412


In [14]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Balanced Accuracy en test:", balanced_accuracy_score(y_test, y_pred))

Balanced Accuracy en test: 0.6717293761880434


## Paso 5: Guardar el modelo

In [15]:
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usanzo la libreria gzip.

# Crear carpeta destino si no existe
os.makedirs("../files/models", exist_ok=True)

# Guardar modelo comprimido en formato .pkl.gz
with gzip.open("../files/models/model.pkl.gz", "wb") as file:
    pickle.dump(grid_search, file)

## Paso 6: Calcular las metricas de precision

In [16]:
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba.

def compute_metrics(model, x_train, y_train, x_test, y_test):
    
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    metrics = [
        {
            'type': 'metrics',
            'dataset': 'train',
            'precision': precision_score(y_train, y_train_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
            'recall': recall_score(y_train, y_train_pred, zero_division=0),
            'f1_score': f1_score(y_train, y_train_pred, zero_division=0)
        },
        {
            'type': 'metrics',
            'dataset': 'test',
            'precision': precision_score(y_test, y_test_pred, zero_division=0),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
            'recall': recall_score(y_test, y_test_pred, zero_division=0),
            'f1_score': f1_score(y_test, y_test_pred, zero_division=0)
        },
        {
            'type': 'cm_matrix',
            'dataset': 'train',
            'true_0': {'predicted_0': int(cm_train[0, 0]), 'predicted_1': int(cm_train[0, 1])},
            'true_1': {'predicted_0': int(cm_train[1, 0]), 'predicted_1': int(cm_train[1, 1])}
        },
        {
            'type': 'cm_matrix',
            'dataset': 'test',
            'true_0': {'predicted_0': int(cm_test[0, 0]), 'predicted_1': int(cm_test[0, 1])},
            'true_1': {'predicted_0': int(cm_test[1, 0]), 'predicted_1': int(cm_test[1, 1])}
        }
    ]

    return metrics

In [17]:
metrics = compute_metrics(best_model, X_train, y_train, X_test, y_test)

In [18]:
# Crear carpeta destino si no existe
os.makedirs("../files/output", exist_ok=True)

# Guardar metricas en archivo JSON
with open("../files/output/metrics.json", "w") as file:
        for metric in metrics:
            file.write(json.dumps(metric, ensure_ascii=False))
            file.write('\n')

## Paso 7: Calcular las matrices de confusion

In [19]:
# Calcule las matrices de confusion para los conjuntos de entrenamiento y
# prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# de entrenamiento o prueba.

def calc_confusion_matrices(model, x_train, y_train, x_test, y_test):
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    confusion_matrices = [
        {
            'type': 'cm_matrix',
            'dataset': 'train',
            'true_0': {'predicted_0': int(cm_train[0, 0]), 'predicted_1': int(cm_train[0, 1])},
            'true_1': {'predicted_0': int(cm_train[1, 0]), 'predicted_1': int(cm_train[1, 1])}
        },
        {
            'type': 'cm_matrix',
            'dataset': 'test',
            'true_0': {'predicted_0': int(cm_test[0, 0]), 'predicted_1': int(cm_test[0, 1])},
            'true_1': {'predicted_0': int(cm_test[1, 0]), 'predicted_1': int(cm_test[1, 1])}
        }
    ]

    return confusion_matrices

In [21]:
matrices_confusion = calc_confusion_matrices(best_model, X_train, y_train, X_test, y_test)

In [22]:
# Guardar metricas en archivo JSON
with open("../files/output/metrics.json", "w") as file:
        for metric in metrics:
            file.write(json.dumps(metric, ensure_ascii=False))
            file.write('\n')