In [47]:
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split
import pickle
import os
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, balanced_accuracy_score, make_scorer, balanced_accuracy_score, accuracy_score, classification_report
import json
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer

In [2]:
def load_data(df_path):
    """Read the CSV located at ``df_path`` and return it as a DataFrame."""
    return pd.read_csv(df_path)

In [3]:
def clean_data(df: pd.DataFrame):
    """Return a cleaned copy of the raw frame; the input is left untouched.

    The ``default payment next month`` column is renamed to ``default``,
    the ``ID`` column is removed, rows containing missing values are
    discarded, and every ``EDUCATION`` value above 4 is replaced by 4.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        .dropna()
    )
    # Cap EDUCATION at 4 so all higher codes collapse into one category.
    return cleaned.assign(EDUCATION=cleaned["EDUCATION"].clip(upper=4))

In [5]:
# Relative locations of the raw train/test CSV files.
traindf_path, testdf_path = (
    '../files/input/train_default_of_credit_card_clients.csv',
    '../files/input/test_default_of_credit_card_clients.csv',
)

# Read both splits into DataFrames (train first, then test).
train_df, test_df = map(load_data, (traindf_path, testdf_path))

In [6]:
# Run the same cleaning routine over both splits.
train_cleaned, test_cleaned = (clean_data(split) for split in (train_df, test_df))

In [7]:
# Separate the 'default' target from the predictors for each split.
y_train, y_test = train_cleaned['default'], test_cleaned['default']
X_train, X_test = (
    split.drop(columns=['default']) for split in (train_cleaned, test_cleaned)
)

In [8]:
# Categorical predictors; every other column of X_train is treated as numeric.
categorical = ['SEX', 'EDUCATION', 'MARRIAGE']
numeric = list(filter(lambda name: name not in categorical, X_train.columns))

In [None]:

# One-hot encode the categorical predictors and pass the numeric ones through.
# sparse_threshold=0 makes the transformer always return a dense array: the
# one-hot block is sparse, and the StandardScaler that follows centers the
# data, which raises on sparse input. Without this the pipeline only works
# while the stacked output happens to stay denser than the default threshold.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric)
    ],
    sparse_threshold=0,
)

# NOTE(review): this cell has no execution count and the stored grid-search
# output shows a scaler step with a `feature_range` parameter (MinMaxScaler),
# so the saved outputs predate this definition. Re-run the notebook from a
# fresh kernel before trusting the stored results.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=None)),  # n_components=None keeps every component
    ('select', SelectKBest(score_func=f_classif)),  # k is set by the grid search
    ('classifier', MLPClassifier(max_iter=1000, random_state=42))  # seeded MLP
])

In [67]:
# Search space for the grid search; keys use the `<step>__<parameter>` form.
param_grid = dict(
    select__k=[10, 15, 20],  # number of features kept by the selector
    classifier__hidden_layer_sizes=[(50,), (100,), (50, 50)],  # MLP layouts
    classifier__alpha=[0.0001, 0.001],  # regularization strength
    classifier__learning_rate_init=[0.001, 0.01],  # initial learning rate
)


In [68]:
# Exhaustive search over param_grid, scored by balanced accuracy.
search_settings = {
    "cv": 10,  # original author's note: lower this to 5 folds
    "scoring": "balanced_accuracy",
    "n_jobs": -1,  # use every available core
    "verbose": 2,
}
grid = GridSearchCV(pipeline, param_grid, **search_settings)
grid.fit(X_train, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__alpha': [0.0001, 0.001], 'classifier__hidden_layer_sizes': [(50,), (100,), ...], 'classifier__learning_rate_init': [0.001, 0.01], 'select__k': [10, 15, ...]}"
,scoring,'balanced_accuracy'
,n_jobs,-1
,refit,True
,cv,10
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_components,
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,score_func,<function f_c...001A3BD132B60>
,k,20

0,1,2
,hidden_layer_sizes,"(50,)"
,activation,'relu'
,solver,'adam'
,alpha,0.001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,1000
,shuffle,True


In [69]:
# Persist the fitted search object as a gzip-compressed pickle.
model_path = "../files/models/model.pkl.gz"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with gzip.open(model_path, "wb") as model_file:
    pickle.dump(grid, model_file)

In [70]:
def compute_metrics(y_true, y_pred, dataset_name):
    """Build the metrics record for one dataset split.

    Returns a dict with the record type (``"metrics"``), the split name and
    the precision, balanced accuracy, recall and F1 score of ``y_pred``
    against ``y_true``. Precision, recall and F1 are computed with
    ``zero_division=0``.
    """
    report = {"type": "metrics", "dataset": dataset_name}
    report["precision"] = precision_score(y_true, y_pred, zero_division=0)
    report["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred)
    report["recall"] = recall_score(y_true, y_pred, zero_division=0)
    report["f1_score"] = f1_score(y_true, y_pred, zero_division=0)
    return report

# Predict on both splits with the fitted grid search, then score each split.
y_pred_train, y_pred_test = grid.predict(X_train), grid.predict(X_test)

metrics_train, metrics_test = (
    compute_metrics(y_train, y_pred_train, "train"),
    compute_metrics(y_test, y_pred_test, "test"),
)

def compute_cm(y_true, y_pred, dataset_name):
    """Build the confusion-matrix record for one dataset split.

    Returns a dict with the record type (``"cm_matrix"``), the split name
    and, for each true label (0 and 1), the count of samples predicted as
    0 and as 1, stored as plain Python ints.
    """
    counts = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # labels=[0, 1] fixes a 2x2 layout: one row per true label,
    # one column per predicted label.
    (true0_pred0, true0_pred1), (true1_pred0, true1_pred1) = counts

    record = {"type": "cm_matrix", "dataset": dataset_name}
    record["true_0"] = {
        "predicted_0": int(true0_pred0),
        "predicted_1": int(true0_pred1),
    }
    record["true_1"] = {
        "predicted_0": int(true1_pred0),
        "predicted_1": int(true1_pred1),
    }
    return record

# Confusion-matrix records for the train and test splits.
cm_train, cm_test = (
    compute_cm(y_train, y_pred_train, "train"),
    compute_cm(y_test, y_pred_test, "test"),
)

In [71]:
# Write one JSON record per line: both metrics records, then both matrices.
output_dir = "../files/output"
os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(output_dir, "metrics.json")
records = (metrics_train, metrics_test, cm_train, cm_test)
with open(output_file, "w", encoding="utf-8") as sink:
    sink.writelines(json.dumps(record) + "\n" for record in records)