In [1]:
import pandas as pd
import pickle
import os
import gzip
import json
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
print("Imports completed")

Imports completed


In [2]:
# Read the zipped CSV files for the train and test partitions.
train_data = pd.read_csv(
    "../files/input/train_data.csv.zip",
    index_col=False,
    compression="zip",
)
test_data = pd.read_csv(
    "../files/input/test_data.csv.zip",
    index_col=False,
    compression="zip",
)
print("Data loaded")

Data loaded


In [3]:
# Step 1: apply the same cleaning rules to the train and test partitions.
def _clean_dataset(raw):
    """Return a cleaned copy of one raw partition.

    The cleaning steps are:
      * rename 'default payment next month' to 'default';
      * drop rows whose MARRIAGE or EDUCATION value is 0;
      * cap EDUCATION at 4 (every value above 4 becomes 4);
      * drop the 'ID' column.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing at least the columns MARRIAGE, EDUCATION and ID.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw`` itself is not modified.

    Raises
    ------
    KeyError
        If MARRIAGE, EDUCATION or ID is missing (for example when the cell is
        re-run on an already cleaned frame, where ID no longer exists).
    """
    renamed = raw.rename(columns={'default payment next month': 'default'})
    keep_rows = (renamed["MARRIAGE"] != 0) & (renamed["EDUCATION"] != 0)
    # Explicit copy: assigning a column on a filtered slice would otherwise
    # trigger pandas' SettingWithCopyWarning.
    cleaned = renamed.loc[keep_rows].copy()
    # Vectorized equivalent of mapping every value greater than 4 to 4.
    cleaned["EDUCATION"] = cleaned["EDUCATION"].clip(upper=4)
    return cleaned.drop(columns=["ID"])


test_data = _clean_dataset(test_data)
train_data = _clean_dataset(train_data)
print("Data cleaned")

Data cleaned


In [4]:
# Separate the "default" target column from the feature columns of each partition.
x_train, y_train = train_data.drop(columns=["default"]), train_data["default"]
x_test, y_test = test_data.drop(columns=["default"]), test_data["default"]
print("Data splitted")

Data splitted


In [5]:

# The three category-coded columns are one-hot encoded; every other column of
# x_train is treated as numerical and standardized.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = [
    name for name in x_train.columns if name not in categorical_features
]

preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(), categorical_features),
        ('scaler', StandardScaler(), numerical_features),
    ]
)

# Steps run in order: preprocessing, SelectKBest scored with f_classif,
# PCA, and finally the MLP classifier (seeded with random_state=21).
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ('feature_selection', SelectKBest(score_func=f_classif)),
        ('pca', PCA()),
        ('classifier', MLPClassifier(random_state=21, max_iter=15000)),
    ]
)
print("Pipeline created")

Pipeline created


In [6]:

# Hyper-parameter grid with a single candidate per parameter, so the search
# evaluates exactly one configuration across the cross-validation folds.
param_grid = {
    "feature_selection__k": [20],
    "pca__n_components": [None],
    "classifier__hidden_layer_sizes": [(50, 30, 40, 60)],
    "classifier__alpha": [0.26],
    "classifier__learning_rate_init": [0.001],
}

model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=10,
    n_jobs=-1,
    refit=True,
    verbose=2,
)

# Train the model
model.fit(x_train, y_train)
print("Model trained")

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Model trained


In [7]:
# Persist the fitted search object as a gzip-compressed pickle.
models_dir = '../files/models'
os.makedirs(models_dir, exist_ok=True)

# Build the file path from models_dir so the directory that is created and the
# directory that is written to cannot drift apart.
model_path = os.path.join(models_dir, "model.pkl.gz")

with gzip.open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)

In [8]:

def calculate_metrics(y_true, y_pred, dataset_name):
    """Build the metrics record for one dataset.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    dataset_name : str
        Value stored under the "dataset" key (the notebook passes 'train' or 'test').

    Returns
    -------
    dict
        Keys, in order: "type" (always "metrics"), "dataset", "precision",
        "balanced_accuracy", "recall" and "f1_score". Precision, recall and
        F1 are computed with ``average='binary'``.
    """
    precision = precision_score(y_true, y_pred, average='binary')
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred, average='binary')
    f1 = f1_score(y_true, y_pred, average='binary')

    return {
        "type": "metrics",
        "dataset": dataset_name,
        "precision": precision,
        "balanced_accuracy": balanced_accuracy,
        "recall": recall,
        "f1_score": f1,
    }


# Predict on both partitions with the fitted search object.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

train_metrics = calculate_metrics(y_train, y_train_pred, 'train')
test_metrics = calculate_metrics(y_test, y_test_pred, 'test')

output_path = "../files/output/metrics.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Mode 'w' starts the file from scratch: one JSON object per line, train first.
with open(output_path, 'w') as metrics_file:
    metrics_file.writelines(
        json.dumps(record) + '\n' for record in (train_metrics, test_metrics)
    )

print(f"Métricas guardadas en: {output_path}")

Métricas guardadas en: ../files/output/metrics.json


In [9]:
from sklearn.metrics import confusion_matrix

def calculate_and_save_confusion_matrices(model, X_train, X_test, y_train, y_test,
                                          output_path='../files/output/metrics.json'):
    """Append the train and test confusion matrices to the metrics file.

    Each matrix is written as one JSON object per line, train first, in
    append mode, so any lines already in the file are kept.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_train, X_test : array-like or DataFrame
        Feature matrices passed to ``model.predict``.
    y_train, y_test : array-like
        True labels; assumed to be the binary classes 0 and 1.
    output_path : str, optional
        File to append to. Its directory must already exist, because this
        function does not create it.

    Returns
    -------
    None
    """
    # Pin the label order so the matrix is always 2x2 with rows/columns (0, 1).
    # Without this, data where only one class appears yields a 1x1 matrix and
    # the indexing below raises IndexError.
    class_labels = [0, 1]

    def format_confusion_matrix(cm, dataset_type):
        """Convert a 2x2 confusion matrix into the JSON-serializable record."""
        return {
            'type': 'cm_matrix',
            'dataset': dataset_type,
            'true_0': {
                'predicted_0': int(cm[0, 0]),
                'predicted_1': int(cm[0, 1])
            },
            'true_1': {
                'predicted_0': int(cm[1, 0]),
                'predicted_1': int(cm[1, 1])
            }
        }

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    cm_train = confusion_matrix(y_train, y_train_pred, labels=class_labels)
    cm_test = confusion_matrix(y_test, y_test_pred, labels=class_labels)

    records = [
        format_confusion_matrix(cm_train, 'train'),
        format_confusion_matrix(cm_test, 'test')
    ]

    with open(output_path, 'a') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')


def main(model, X_train, X_test, y_train, y_test):
    """Ensure the output directory exists and append the confusion matrices.

    This function only handles the confusion matrices; it does not compute
    precision/recall style metrics. It works solely from its arguments and
    reads no notebook-level prediction variables, so it can be called with any
    consistent model/data combination.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_train, X_test : array-like or DataFrame
        Feature matrices for each partition.
    y_train, y_test : array-like
        True labels for each partition.

    Returns
    -------
    None
    """
    # `os` is imported in the notebook's import cell.
    os.makedirs('../files/output', exist_ok=True)

    calculate_and_save_confusion_matrices(model, X_train, X_test, y_train, y_test)

# Run the reporting step with the fitted search object and both data partitions.
main(model, x_train, x_test, y_train, y_test)