In [20]:
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle
import os
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, balanced_accuracy_score, make_scorer, balanced_accuracy_score, accuracy_score, classification_report
import json
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from collections import OrderedDict
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [21]:
def load_data(df_path):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    df_path
        Path (or any other input accepted by ``pd.read_csv``) of the CSV
        to read.

    Returns
    -------
    pd.DataFrame
        The parsed table, using the default ``pd.read_csv`` options.
    """
    return pd.read_csv(df_path)

In [22]:
def clean_data(df: pd.DataFrame):
    """Return a cleaned copy of a raw credit-default table.

    Steps, applied in order:

    1. Rename the column "default payment next month" to "default".
    2. Drop the "ID" column.
    3. Drop every row that contains a missing value.
    4. Replace every EDUCATION value greater than 4 with 4.

    Parameters
    ----------
    df : pd.DataFrame
        Raw table; must contain the "ID" and "EDUCATION" columns.

    Returns
    -------
    pd.DataFrame
        A new frame; the input frame is not modified.
    """
    return (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=['ID'])
        .dropna()
        # Capping at 4 is the same as overwriting every value above 4 with 4.
        .assign(EDUCATION=lambda frame: frame['EDUCATION'].clip(upper=4))
    )

In [23]:
# Locations of the train / test CSV files, relative to the notebook.
traindf_path, testdf_path = (
    '../files/input/train_default_of_credit_card_clients.csv',
    '../files/input/test_default_of_credit_card_clients.csv',
)

# Read both splits with the same loader, train first.
train_df, test_df = (load_data(csv_path) for csv_path in (traindf_path, testdf_path))

In [24]:
# Apply the same cleaning steps to both splits (train first, then test).
train_cleaned, test_cleaned = (clean_data(raw_split) for raw_split in (train_df, test_df))

In [25]:
# Split each cleaned frame into the 'default' target and the remaining
# predictor columns.
y_train = train_cleaned['default']
X_train = train_cleaned.drop(columns=['default'])

y_test = test_cleaned['default']
X_test = test_cleaned.drop(columns=['default'])

In [26]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categóricas usando el método
#   one-hot-encoding.
# - Descompone la matriz de entrada usando PCA. El PCA usa todas las componentes.
# - Estandariza la matriz de entrada.
# - Selecciona las K columnas más relevantes de la matriz de entrada.
# - Ajusta una máquina de vectores de soporte (SVM).



In [27]:
# Columns that are one-hot encoded.
cat_cols = ["SEX", "EDUCATION", "MARRIAGE"]

# Columns forwarded to the next step unchanged. Note that the repayment-status
# columns go from PAY_0 straight to PAY_2 (there is no PAY_1 in this list).
num_cols = (
    ["LIMIT_BAL", "AGE"]
    + ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
    + ["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]
    + ["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]
)

# Encoder for the categorical columns: dense output, and categories unseen
# during fit are ignored instead of raising.
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# One-hot encode cat_cols and pass num_cols through untouched. The encoded
# columns come first in the output, followed by the numeric ones.
preprocessor = ColumnTransformer([
    ("cat", categorical_transformer, cat_cols),
    ("num", "passthrough", num_cols),
])

# Full model pipeline. The step names are referenced by the hyperparameter
# grid ("selectk__k", "svc__C", ...), so they must not be renamed.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    # n_components=None keeps every principal component.
    ("pca", PCA(n_components=None, svd_solver="full")),
    ("scaler", StandardScaler()),
    # k="all" keeps every column by default; the grid search overrides k.
    ("selectk", SelectKBest(f_classif, k="all")),
    ("svc", SVC()),
])

In [None]:
# Hyperparameter search space, keyed by "<pipeline step>__<parameter>".
# 3 values of C x 3 values of gamma x 2 values of k = 18 candidates.
param_grid = {
    "svc__C": [1, 10, 100],
    "svc__kernel": ["rbf"],
    "svc__gamma": ["scale", 0.01, 0.001],
    "svc__class_weight": ["balanced"],
    # NOTE(review): 40 may be larger than the number of columns that reach
    # SelectKBest (20 numeric columns plus the one-hot columns of the three
    # categorical variables) -- confirm that it is a meaningful setting and
    # not just a duplicate of "all".
    "selectk__k": [40, "all"],
}

# Exhaustive search over the grid, scored by balanced accuracy with 10-fold
# cross-validation, using every available CPU core.
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=10,
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=2,
)

grid.fit(X_train, y_train)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


0,1,2
,estimator,"Pipeline(step...svc', SVC())])"
,param_grid,"{'selectk__k': ['all', 40], 'svc__C': [1, 10], 'svc__class_weight': ['balanced'], 'svc__kernel': ['linear']}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'full'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,score_func,<function f_c...00212EBB70360>
,k,'all'

0,1,2
,C,1
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [30]:
# Persist the whole fitted grid-search object as a gzip-compressed pickle.
model_path = "../files/models/model.pkl.gz"
# Create the target folder first; it is not an error if it already exists.
os.makedirs("../files/models", exist_ok=True)
with gzip.open(model_path, mode="wb") as f:
    pickle.dump(grid, f)

In [31]:
def compute_metrics(y_true, y_pred, dataset_name):
    """Build the metrics record for one dataset split.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, aligned with ``y_true``.
    dataset_name
        Label stored under the "dataset" key (e.g. "train" or "test").

    Returns
    -------
    dict
        Keys, in order: "type" (always "metrics"), "dataset", "precision",
        "balanced_accuracy", "recall" and "f1_score". Precision, recall and
        F1 are computed with ``zero_division=0``.
    """
    record = {"type": "metrics", "dataset": dataset_name}
    record["precision"] = precision_score(y_true, y_pred, zero_division=0)
    record["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)
    record["recall"] = recall_score(y_true, y_pred, zero_division=0)
    record["f1_score"] = f1_score(y_true, y_pred, zero_division=0)
    return record

# Predict both splits with the fitted grid-search object.
y_pred_train, y_pred_test = grid.predict(X_train), grid.predict(X_test)

# One metrics record per split, train first.
metrics_train, metrics_test = (
    compute_metrics(y_train, y_pred_train, "train"),
    compute_metrics(y_test, y_pred_test, "test"),
)

def compute_cm(y_true, y_pred, dataset_name):
    """Build the confusion-matrix record for one dataset split.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, aligned with ``y_true``.
    dataset_name
        Label stored under the "dataset" key (e.g. "train" or "test").

    Returns
    -------
    dict
        Keys "type" (always "cm_matrix"), "dataset", "true_0" and "true_1".
        Each "true_<i>" entry maps "predicted_0" / "predicted_1" to the
        corresponding count as a plain Python int.
    """
    # labels=[0, 1] fixes the matrix to 2x2 with rows = true label and
    # columns = predicted label, both ordered 0 then 1.
    counts = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # tolist() yields nested lists of plain Python ints.
    (true0_pred0, true0_pred1), (true1_pred0, true1_pred1) = counts.tolist()
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {"predicted_0": true0_pred0, "predicted_1": true0_pred1},
        "true_1": {"predicted_0": true1_pred0, "predicted_1": true1_pred1},
    }

# One confusion-matrix record per split, train first.
cm_train, cm_test = (
    compute_cm(y_train, y_pred_train, "train"),
    compute_cm(y_test, y_pred_test, "test"),
)

In [34]:
# Show the training metrics as the cell's output (the last expression of a
# cell is displayed automatically, so no print() is needed).
metrics_train

{'type': 'metrics', 'dataset': 'train', 'precision': 0.290437890974084, 'balanced_accuracy': 0.5598844647099285, 'recall': 0.41252379944996825, 'f1_score': 0.3408792937680273}


FileNotFoundError: [Errno 2] No such file or directory: '../files/output/metrics.json'

In [36]:
# Destination of the metrics report.
output_dir = "../files/output"
output_file = os.path.join(output_dir, "metrics.json")

# The folder must exist before the file can be opened for writing.
os.makedirs(output_dir, exist_ok=True)

# One JSON document per line: train metrics, test metrics, then the train
# and test confusion matrices.
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(record) + "\n"
        for record in (metrics_train, metrics_test, cm_train, cm_test)
    )