In [1]:
# importacion de datasets

import pandas as pd

train = pd.read_csv('../files/input/train_data.csv.zip', compression='zip')
test = pd.read_csv('../files/input/test_data.csv.zip', compression='zip')

In [2]:
# Paso 1: Limpieza de datos

def preprocess_data(df):
    df = df.rename(columns={'default payment next month': 'default'})
    df = df.drop('ID', axis=1)
    df = df.dropna()
    df = df[(df['EDUCATION'] != 0 ) & (df['MARRIAGE'] != 0)]
    df['EDUCATION'] = df['EDUCATION'].apply(lambda x : 4 if x > 4 else x)

    return df

train = preprocess_data(train)
test = preprocess_data(test)

In [3]:
# Paso 2: Dividir el dataset en conjunto de entrenamiento y prueba

x_train = train.drop('default', axis=1)
y_train = train['default']
x_test = test.drop('default', axis=1)
y_test = test['default']

In [None]:
# Paso 3: Crear pipeline

from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer


# Define the column transformer for categorical variables
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
numerical_features = x_train.columns.difference(categorical_features + ['default'])

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', MinMaxScaler(), x_train.columns.difference(categorical_features + ['default']))
    ]
)

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=None)),
    ('selectkbest', SelectKBest(score_func=f_classif, k=1)),
    ('classifier', MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42))
])

# Fit the pipeline
pipeline.fit(x_train, y_train)
print('Train score: ', balanced_accuracy_score(y_train, pipeline.predict(x_train)))
print('Test score: ', balanced_accuracy_score(y_test, pipeline.predict(x_test)))

Train score:  0.6025380653726722
Test score:  0.6114921084555325


In [5]:
# Paso 4: Optimizar hiperparametros
from sklearn.model_selection import GridSearchCV

param_grid = {
    'pca__n_components': [5, 10, 15],
    'classifier__hidden_layer_sizes': [(50,), (100,), (150,)],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__learning_rate_init': [0.001, 0.01, 0.1]
}

param_grid_prueba = {
    'pca__n_components': [5],
    'classifier__hidden_layer_sizes': [(50,)],
    'classifier__alpha': [0.0001],
    'classifier__learning_rate_init': [0.001]
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid_prueba, cv=10, scoring='balanced_accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best balanced accuracy score: ", grid_search.best_score_)

Best parameters found:  {'classifier__alpha': 0.0001, 'classifier__hidden_layer_sizes': (50,), 'classifier__learning_rate_init': 0.001, 'pca__n_components': 5}
Best balanced accuracy score:  0.599636029273277


In [6]:
# Paso 5: Guardar el modelo

import os
import gzip
import pickle

os.makedirs('../files/models', exist_ok=True)
with gzip.open('../files/models/model.pkl.gz', 'wb') as f:
    pickle.dump(grid_search, f)

In [7]:
# Paso 6 y 7: Calculo de metricas y matriz de confusion

def metrics_report(y_test, y_train):
    from sklearn.metrics import confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score

    y_test_predict = grid_search.predict(x_test)
    y_train_predict = grid_search.predict(x_train)

    cm_train = confusion_matrix(y_train, y_train_predict)
    cm_test = confusion_matrix(y_test, y_test_predict)

    metrics = [
        {'type': 'metrics', 'dataset': 'train', 'precision': precision_score(y_train, y_train_predict), 'balanced_accuracy': balanced_accuracy_score(y_train, y_train_predict), 'recall': recall_score(y_train, y_train_predict), 'f1_score': f1_score(y_train, y_train_predict)},
        {'type': 'metrics', 'dataset': 'test', 'precision': precision_score(y_test, y_test_predict), 'balanced_accuracy': balanced_accuracy_score(y_test, y_test_predict), 'recall': recall_score(y_test, y_test_predict), 'f1_score': f1_score(y_test, y_test_predict)},
        {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": int(cm_train[0][0]), "predicted_1": int(cm_train[0][1])}, 'true_1': {"predicted_0": int(cm_train[1][0]), "predicted_1": int(cm_train[1][1])}},
        {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": int(cm_test[0][0]), "predicted_1": int(cm_test[0][1])}, 'true_1': {"predicted_0": int(cm_test[1][0]), "predicted_1": int(cm_test[1][1])}}
    ]

    return metrics

results = metrics_report(y_test, y_train)


In [8]:
# Paso 8: Guardar metricas

import json
os.makedirs('../files/output', exist_ok=True)
with open('../files/output/metrics.json', 'w') as f:
    for metric in results:
        f.write(json.dumps(metric) + '\n')

In [9]:
TESTING = [
    {
        "type": "metrics",
        "dataset": "train",
        "precision": 0.691,
        "balanced_accuracy": 0.661,
        "recall": 0.370,
        "f1_score": 0.482,
    },
    {
        "type": "metrics",
        "dataset": "test",
        "precision": 0.673,
        "balanced_accuracy": 0.661,
        "recall": 0.370,
        "f1_score": 0.482,
    },
    {
        "type": "cm_matrix",
        "dataset": "train",
        "true_0": {"predicted_0": 15440, "predicted_1": None},
        "true_1": {"predicted_0": None, "predicted_1": 1735},
    },
    {
        "type": "cm_matrix",
        "dataset": "test",
        "true_0": {"predicted_0": 6710, "predicted_1": None},
        "true_1": {"predicted_0": None, "predicted_1": 730},
    },
]

In [10]:
def comparison(metrics, test):
    keys = ['precision', 'balanced_accuracy', 'recall', 'f1_score']
    for key in keys:
        for metric, test_metric in zip(metrics, test):
            if key in metric and key in test_metric:
                if metric[key] is not None and test_metric[key] is not None:
                    if metric[key] > test_metric[key]:
                        print(f'{key} ==> Better')
                    else:
                        print(f'{key} ==> Worse')

    for metric, test_metric in zip(metrics, test):
        if metric['type'] == 'cm_matrix' and test_metric['type'] == 'cm_matrix':
            for key in ['true_0', 'true_1']:
                for subkey in ['predicted_0', 'predicted_1']:
                    if metric[key][subkey] is not None and test_metric[key][subkey] is not None:
                        if metric[key][subkey] > test_metric[key][subkey]:
                            print(f'{key} ==> Better')
                        else:
                            print(f'{key} ==> Worse')

comparison(results, TESTING)

precision ==> Worse
precision ==> Worse
balanced_accuracy ==> Worse
balanced_accuracy ==> Worse
recall ==> Worse
recall ==> Worse
f1_score ==> Worse
f1_score ==> Worse
true_0 ==> Better
true_1 ==> Worse
true_0 ==> Better
true_1 ==> Worse


In [15]:
def comparar_metricas(metrics, testing):
    keys = ['precision', 'balanced_accuracy', 'recall', 'f1_score']
    for key in keys:
        for metric, test_metric in zip(metrics, testing):
            if key in metric and key in test_metric:
                if metric[key] is not None and test_metric[key] is not None:
                    print(f'{key} My value: {metric[key]}, Test value: {test_metric[key]}')
                    if metric[key] > test_metric[key]:
                        print(f'==> Better \n')
                    else:
                        print(f'==> Worse \n')

    for metric, test_metric in zip(metrics, testing):
        if metric['type'] == 'cm_matrix' and test_metric['type'] == 'cm_matrix':
            for key in ['true_0', 'true_1']:
                for subkey in ['predicted_0', 'predicted_1']:
                    if metric[key][subkey] is not None and test_metric[key][subkey] is not None:
                        print(f'{key} {subkey} My value: {metric[key][subkey]}, Test value: {test_metric[key][subkey]}')
                        if metric[key][subkey] > test_metric[key][subkey]:
                            print(f' ==> Better \n')
                        else:
                            print(f' ==> Worse \n')

comparar_metricas(results, TESTING)

precision My value: 0.6437224669603524, Test value: 0.691
==> Worse 

precision My value: 0.6377358490566037, Test value: 0.673
==> Worse 

balanced_accuracy My value: 0.603769022905084, Test value: 0.661
==> Worse 

balanced_accuracy My value: 0.612641937201444, Test value: 0.661
==> Worse 

recall My value: 0.2474074074074074, Test value: 0.37
==> Worse 

recall My value: 0.2660020986358867, Test value: 0.37
==> Worse 

f1_score My value: 0.35743770065739183, Test value: 0.482
==> Worse 

f1_score My value: 0.37541651240281376, Test value: 0.482
==> Worse 

true_0 predicted_0 My value: 15581, Test value: 15440
 ==> Better 

true_1 predicted_1 My value: 1169, Test value: 1735
 ==> Worse 

true_0 predicted_0 My value: 6785, Test value: 6710
 ==> Better 

true_1 predicted_1 My value: 507, Test value: 730
 ==> Worse 

