In [1]:
import os
import gzip
import json
import pickle

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV


In [2]:
# Read the zipped CSV splits; the first path is bound to train_data and the
# second to test_data.
train_data, test_data = (
    pd.read_csv(csv_path, compression="zip")
    for csv_path in (
        "files/input/train_data.csv.zip",
        "files/input/test_data.csv.zip",
    )
)

# Preview the first rows of both frames as the cell output.
train_data.head(), test_data.head()

(      ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  \
 0  10748     310000    1          3         1   32      0      0      0   
 1  12574      10000    2          3         1   49     -1     -1     -2   
 2  29677      50000    1          2         1   28     -1     -1     -1   
 3   8857      80000    2          3         1   52      2      2      3   
 4  21099     270000    1          1         2   34      1      2      0   
 
    PAY_4  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
 0      0  ...      84373      57779      14163      8295      6000      4000   
 1     -1  ...       1690       1138        930         0         0      2828   
 2      0  ...      45975       1300      43987         0     46257      2200   
 3      3  ...      40748      39816      40607      3700      1600      1600   
 4      0  ...      22448      15490      17343         0      4000      2000   
 
    PAY_AMT4  PAY_AMT5  PAY_AMT6  default payment next

In [3]:
def cleanse(df):
    """Return a cleaned copy of a raw credit-default frame.

    The steps are applied in this order:

    1. Rename ``"default payment next month"`` to ``"default"``.
    2. Drop the ``"ID"`` column. A missing ``"ID"`` is tolerated so the
       function can safely be applied to a frame it has already cleaned
       (e.g. when the calling cell is re-run).
    3. Drop every row that contains a missing value.
    4. Drop rows where ``EDUCATION`` or ``MARRIAGE`` equals 0.
    5. Cap ``EDUCATION`` at 4 (any value above 4 becomes 4).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least ``EDUCATION`` and ``MARRIAGE`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Row index labels of the
        surviving rows are kept as they were in ``df``.

    Raises
    ------
    KeyError
        If ``EDUCATION`` or ``MARRIAGE`` is missing from ``df``.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"], errors="ignore")
        .dropna()
    )

    valid_rows = (cleaned["EDUCATION"] != 0) & (cleaned["MARRIAGE"] != 0)
    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame (no SettingWithCopyWarning).
    cleaned = cleaned.loc[valid_rows].copy()

    # Vectorised equivalent of "4 if value > 4 else value".
    cleaned["EDUCATION"] = cleaned["EDUCATION"].clip(upper=4)
    return cleaned

# Clean both splits with the same routine. The names are re-bound, so after
# this point train_data / test_data refer to the cleaned frames only.
train_data = train_data.pipe(cleanse)
test_data = test_data.pipe(cleanse)

# Preview the cleaned training frame as the cell output.
train_data.head()


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [4]:
# Split each frame into predictors (every column except "default") and the
# "default" target column.
x_train, y_train = train_data.drop(columns=["default"]), train_data["default"]
x_test, y_test = test_data.drop(columns=["default"]), test_data["default"]

# Columns handed to make_pipeline as the categorical ones.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]


In [5]:
def make_pipeline(categorical_cols):
    """Build the preprocessing + random-forest pipeline (unfitted).

    Parameters
    ----------
    categorical_cols : list
        Columns to one-hot encode. Every other column is passed through
        unchanged (``remainder="passthrough"``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``"preprocessor"`` (a ``ColumnTransformer``) followed by
        ``"classifier"`` (a ``RandomForestClassifier`` with
        ``random_state=42``).
    """
    encode_categoricals = ColumnTransformer(
        transformers=[
            # Categories not seen during fit are ignored at transform time
            # instead of raising.
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ],
        remainder="passthrough",
    )

    return Pipeline(
        steps=[
            ("preprocessor", encode_categoricals),
            ("classifier", RandomForestClassifier(random_state=42)),
        ]
    )

# Instantiate the (still unfitted) pipeline and show its repr as cell output.
pipeline = make_pipeline(categorical_cols=categorical_features)
pipeline


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
def optimize(pipeline, x_train, y_train):
    """Wrap ``pipeline`` in a 10-fold grid search (unfitted).

    Parameters
    ----------
    pipeline : estimator
        Estimator to tune; the grid keys use the ``classifier__`` prefix, so
        it needs a step named ``"classifier"``.
    x_train, y_train
        Accepted but not used here: this function only configures the search
        and does not call ``fit``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Search scored by balanced accuracy, using all available cores and
        refitting the best candidate once ``fit`` is called.
    """
    # Only min_samples_leaf has more than one candidate value.
    forest_grid = {
        "classifier__n_estimators": [200],
        "classifier__max_depth": [None],
        "classifier__min_samples_split": [10],
        "classifier__min_samples_leaf": [1, 2],
        "classifier__max_features": ["sqrt"],
    }

    return GridSearchCV(
        pipeline,
        forest_grid,
        scoring="balanced_accuracy",
        cv=10,
        refit=True,
        n_jobs=-1,
    )

# Configure the grid search; fitting happens in a later cell.
grid_search = optimize(pipeline=pipeline, x_train=x_train, y_train=y_train)


In [7]:
# Run the cross-validated grid search on the training split.
grid_search.fit(x_train, y_train)

# Report the best cross-validated score (balanced accuracy, as configured in
# optimize) and the hyper-parameter combination that achieved it.
print("Mejor balanced accuracy:", grid_search.best_score_)
print("Mejores parámetros:", grid_search.best_params_)


Mejor balanced accuracy: 0.6564609362269683
Mejores parámetros: {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}


In [8]:
def metrics_calc(y_true, y_pred, dataset):
    """Collect classification metrics for one dataset split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    dataset : str
        Label stored under the ``"dataset"`` key (e.g. ``"train"``).

    Returns
    -------
    dict
        Keys, in order: ``"dataset"``, ``"precision"``,
        ``"balanced_accuracy"``, ``"recall"``, ``"f1_score"``.
    """
    # zero_division=0 makes precision/recall/F1 return 0 rather than warn
    # when their denominator is zero.
    precision = precision_score(y_true, y_pred, zero_division=0)
    balanced = balanced_accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return {
        "dataset": dataset,
        "precision": precision,
        "balanced_accuracy": balanced,
        "recall": recall,
        "f1_score": f1,
    }

def matrix_calc(y_true, y_pred):
    """Return the confusion-matrix cells as ``(tn, fp, fn, tp)``.

    The unpacking requires a 2x2 matrix, i.e. exactly two classes across
    ``y_true`` and ``y_pred``; any other shape raises ``ValueError``.
    """
    # Row 0 holds the true-negative class, row 1 the true-positive class.
    (tn, fp), (fn, tp) = confusion_matrix(y_true, y_pred)
    return tn, fp, fn, tp


# Training split: predictions, metrics and confusion-matrix cells.
pred_train = grid_search.predict(x_train)
train_metrics = metrics_calc(y_train, pred_train, "train")
train_cm = matrix_calc(y_train, pred_train)

# Test split: same three steps.
pred_test = grid_search.predict(x_test)
test_metrics = metrics_calc(y_test, pred_test, "test")
test_cm = matrix_calc(y_test, pred_test)

# Show everything as the cell output.
train_metrics, test_metrics, train_cm, test_cm


({'dataset': 'train',
  'precision': 0.9731102479832686,
  'balanced_accuracy': 0.8418830996918254,
  'recall': 0.6893121693121693,
  'f1_score': 0.8069871159563925},
 {'dataset': 'test',
  'precision': 0.6623488773747841,
  'balanced_accuracy': 0.6735663932822289,
  'recall': 0.4024134312696747,
  'f1_score': 0.5006527415143603},
 (np.int64(16138), np.int64(90), np.int64(1468), np.int64(3257)),
 (np.int64(6682), np.int64(391), np.int64(1139), np.int64(767)))