In [257]:
# Load the train and test datasets from their zipped CSV files

import pandas as pd

# Both files are read with identical options; train is read first, then test.
train, test = (
    pd.read_csv(csv_path, compression='zip')
    for csv_path in (
        '../files/input/train_data.csv.zip',
        '../files/input/test_data.csv.zip',
    )
)

In [258]:
# Step 1: Data cleaning

def preprocess_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Cleaning steps, in order:
      1. Rename the column 'default payment next month' to 'default'.
      2. Drop the 'ID' column if it is present.
      3. Drop every row that contains a missing value.
      4. Drop rows whose EDUCATION or MARRIAGE value is 0.
      5. Cap EDUCATION at 4 (any value above 4 becomes 4).

    The function is idempotent: applying it to its own output returns an
    equal frame, so the cell that calls it can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the EDUCATION and MARRIAGE columns.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the cleaning steps applied.

    Raises
    ------
    KeyError
        If the EDUCATION or MARRIAGE column is missing.
    """
    cleaned = (
        df.rename(columns={'default payment next month': 'default'})
        # errors='ignore' tolerates a frame whose ID column was already removed,
        # which is what makes a second application a no-op instead of a KeyError.
        .drop(columns='ID', errors='ignore')
        .dropna()
    )

    valid_rows = (cleaned['EDUCATION'] != 0) & (cleaned['MARRIAGE'] != 0)
    # .copy() detaches the filtered rows from the parent frame so the column
    # assignment below does not raise SettingWithCopyWarning.
    cleaned = cleaned.loc[valid_rows].copy()

    # Vectorised equivalent of "4 if x > 4 else x".
    cleaned['EDUCATION'] = cleaned['EDUCATION'].clip(upper=4)

    return cleaned

# Run the same cleaning routine on both splits (train first, then test).
train, test = preprocess_data(train), preprocess_data(test)

In [259]:
# Step 2: Split the datasets into training and test sets
# (x_* hold every column except 'default'; y_* hold the 'default' column)

x_train, y_train = train.drop(columns='default'), train['default']
x_test, y_test = test.drop(columns='default'), test['default']

In [260]:
# Step 3: Build the pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif

# These columns are one-hot encoded; every other column of `train`
# except the target 'default' is min-max scaled.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
continuous_features = train.columns.difference([*categorical_features, 'default'])

transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('scaler', MinMaxScaler(), continuous_features),
    ],
    remainder='passthrough',
)

# preprocessing -> keep the single best-scoring feature -> logistic regression
pipeline = Pipeline([
    ('preprocessor', transformer),
    ('feature_selector', SelectKBest(score_func=f_classif, k=1)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

pipeline.fit(x_train, y_train)

# Report the pipeline's default score on each split.
for label, features, target in (
    ('Train score:', x_train, y_train),
    ('Test score:', x_test, y_test),
):
    print(label, pipeline.score(features, target))

Train score: 0.8147759270748819
Test score: 0.8303820024501615


In [261]:
# Step 4: Tune hyperparameters

from sklearn.model_selection import GridSearchCV

# Keys use the '<pipeline step>__<parameter>' convention; each list currently
# holds a single candidate, so the search evaluates exactly one combination.
param_grid = {
    'classifier__C': [1],
    'classifier__solver': ['lbfgs'],
    'classifier__penalty': ['l2'],
    'feature_selector__k': [1],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x_train, y_train)

for label, value in (
    ('Best score:', grid_search.best_score_),
    ('Best params:', grid_search.best_params_),
):
    print(label, value)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Best score: 0.6392688664250823
Best params: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs', 'feature_selector__k': 1}


In [262]:
# Step 5: Save the model

import os
import gzip
import pickle

# Make sure the destination folder exists before writing into it.
os.makedirs('../files/models', exist_ok=True)

# The whole fitted GridSearchCV object is pickled into a gzip-compressed file.
with gzip.open('../files/models/model.pkl.gz', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

In [263]:
# Steps 6 and 7: Compute the metrics and the confusion matrices

from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)


def _metrics_entry(dataset, y_true, y_pred):
    """Build the 'metrics' record (precision, balanced accuracy, recall, F1) for one split."""
    return {
        'type': 'metrics',
        'dataset': dataset,
        'precision': precision_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
    }


def _confusion_entry(dataset, y_true, y_pred):
    """Build the 'cm_matrix' record (2x2 confusion matrix as nested dicts) for one split."""
    # Pinning labels to [0, 1] guarantees a 2x2 matrix even when a split happens
    # to contain a single class; otherwise the indexing below would fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # int(...) turns the numpy integers into plain ints so the record is JSON-serialisable.
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {"predicted_0": int(cm[0][0]), "predicted_1": int(cm[0][1])},
        'true_1': {"predicted_0": int(cm[1][0]), "predicted_1": int(cm[1][1])},
    }


def metrics_report(y_test, y_train):
    """Evaluate the fitted grid search on the train and test splits.

    NOTE: besides its arguments, this function reads the module-level names
    ``grid_search``, ``x_train`` and ``x_test``; they must already be defined
    when it is called.

    Parameters
    ----------
    y_test : array-like
        True labels of the test split (rows aligned with ``x_test``).
    y_train : array-like
        True labels of the train split (rows aligned with ``x_train``).

    Returns
    -------
    list of dict
        Four records, in this order: train metrics, test metrics,
        train confusion matrix, test confusion matrix.
    """
    splits = (
        ('train', y_train, grid_search.predict(x_train)),
        ('test', y_test, grid_search.predict(x_test)),
    )

    return (
        [_metrics_entry(*split) for split in splits]
        + [_confusion_entry(*split) for split in splits]
    )


results = metrics_report(y_test, y_train)


In [264]:
# Step 8: Save the metrics

import json

os.makedirs('../files/output', exist_ok=True)

# One JSON object per line, in the same order as `results`.
with open('../files/output/metrics.json', 'w') as metrics_file:
    metrics_file.writelines(json.dumps(record) + '\n' for record in results)