In [1]:
# importacion de datasets

import pandas as pd

train = pd.read_csv('../files/input/train_data.csv.zip', compression='zip')
test = pd.read_csv('../files/input/test_data.csv.zip', compression='zip')

In [2]:
# Paso 1: Limpieza de datos

def preprocess_data(df):
    df = df.rename(columns={'default payment next month': 'default'})
    df = df.drop('ID', axis=1)
    df = df.dropna()
    df = df[(df['EDUCATION'] != 0 ) & (df['MARRIAGE'] != 0)]
    df['EDUCATION'] = df['EDUCATION'].apply(lambda x : 4 if x > 4 else x)

    return df

train = preprocess_data(train)
test = preprocess_data(test)

In [3]:
# Paso 2: Dividir el dataset en conjunto de entrenamiento y prueba

x_train = train.drop('default', axis=1)
y_train = train['default']
x_test = test.drop('default', axis=1)
y_test = test['default']

In [4]:
# Paso 3: Crear pipeline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score

categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
continouos_features = train.columns.difference(categorical_features + ['default'])

transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='passthrough')

pipeline = Pipeline([
    ('transformer', transformer),
    ('classifier', RandomForestClassifier())
])

pipeline.fit(x_train, y_train)
print('Train score:', balanced_accuracy_score(y_train, pipeline.predict(x_train)))
print('Test score:', balanced_accuracy_score(y_test, pipeline.predict(x_test)))
print('Params:', pipeline.get_params())


Train score: 0.9989551862154771
Test score: 0.6731610120747966
Params: {'memory': None, 'steps': [('transformer', ColumnTransformer(remainder='passthrough',
                  transformers=[('onehot',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['SEX', 'EDUCATION', 'MARRIAGE'])])), ('classifier', RandomForestClassifier())], 'transform_input': None, 'verbose': False, 'transformer': ColumnTransformer(remainder='passthrough',
                  transformers=[('onehot',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 ['SEX', 'EDUCATION', 'MARRIAGE'])]), 'classifier': RandomForestClassifier(), 'transformer__force_int_remainder_cols': True, 'transformer__n_jobs': None, 'transformer__remainder': 'passthrough', 'transformer__sparse_threshold': 0.3, 'transformer__transformer_weights': None, 'transformer__transformers': [('onehot', OneHotEncoder(handle_unknown='ignore'), ['SEX', 

In [5]:
# Paso 4: Optimizar hiperparametros

from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [10, 20, 50],
    'classifier__max_depth': [None],
}

param_grid_2 = {
    'classifier__bootstrap': [True], 
    'classifier__ccp_alpha': [0.0], 
    'classifier__class_weight': [None], 
    'classifier__criterion': ['gini'], 
    'classifier__max_depth': [None], 
    'classifier__max_features': ['sqrt'], 
    'classifier__max_leaf_nodes': [None], 
    'classifier__max_samples': [None], 
    'classifier__min_impurity_decrease': [0.0], 
    'classifier__min_samples_leaf': [1], 
    'classifier__min_samples_split': [2], 
    'classifier__min_weight_fraction_leaf': [0.0], 
    'classifier__monotonic_cst': [None], 
    'classifier__n_estimators': [100], 
    'classifier__n_jobs': [None], 
    'classifier__oob_score': [False], 
    'classifier__random_state': [None], 
    'classifier__verbose': [0], 
    'classifier__warm_start': [False]
}

param_grid_3 = {
    'transformer__onehot__handle_unknown': ['ignore'],
    'transformer__onehot__sparse_output': [True],
    'classifier__bootstrap': [True],
    'classifier__ccp_alpha': [0.0],
    'classifier__class_weight': [None],
    'classifier__criterion': ['gini'],
    'classifier__max_depth': [None],
    'classifier__max_features': ['sqrt'],
    'classifier__max_leaf_nodes': [None],
    'classifier__max_samples': [None],
    'classifier__min_impurity_decrease': [0.0],
    'classifier__min_samples_leaf': [1],
    'classifier__min_samples_split': [2],
    'classifier__min_weight_fraction_leaf': [0.0],
    'classifier__monotonic_cst': [None],
    'classifier__n_estimators': [100],
    'classifier__n_jobs': [None],
    'classifier__oob_score': [False],
    'classifier__random_state': [None],
    'classifier__verbose': [0],
    'classifier__warm_start': [False]
}

param_grid_4 = {
    'classifier__bootstrap': [True], 
    'classifier__criterion': ['gini'], 
    'classifier__max_depth': [None], 
    'classifier__max_features': ['sqrt'], 
    'classifier__max_leaf_nodes': [None], 
    'classifier__max_samples': [None], 
    'classifier__min_samples_leaf': [2], 
    'classifier__min_samples_split': [2], 
    'classifier__n_estimators': [100], 
}


grid_search = GridSearchCV(pipeline, param_grid_4, cv=10, n_jobs=-1, scoring='balanced_accuracy', verbose=2)
grid_search.fit(x_train, y_train)
print('Best Score:', grid_search.best_score_)
print('Best Params:', grid_search.best_params_)
print('Test Score:', grid_search.score(x_test, y_test))
print('Train Score:', grid_search.score(x_train, y_train))

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Best Score: 0.6559933413823907
Best Params: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': None, 'classifier__max_samples': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Test Score: 0.6767237305930701
Train Score: 0.9267911885264608


In [6]:
# Paso 5: Guardar el modelo

import os
import gzip
import pickle

os.makedirs('../files/models', exist_ok=True)
with gzip.open('../files/models/model.pkl.gz', 'wb') as f:
    pickle.dump(grid_search, f)

In [7]:
# Paso 6 y 7: Calculo de metricas y matriz de confusion

def metrics_report(y_test, y_train):
    from sklearn.metrics import confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score

    y_test_predict = grid_search.predict(x_test)
    y_train_predict = grid_search.predict(x_train)

    cm_train = confusion_matrix(y_train, y_train_predict)
    cm_test = confusion_matrix(y_test, y_test_predict)

    metrics = [
        {'type': 'metrics', 'dataset': 'train', 'precision': precision_score(y_train, y_train_predict), 'balanced_accuracy': balanced_accuracy_score(y_train, y_train_predict), 'recall': recall_score(y_train, y_train_predict), 'f1_score': f1_score(y_train, y_train_predict)},
        {'type': 'metrics', 'dataset': 'test', 'precision': precision_score(y_test, y_test_predict), 'balanced_accuracy': balanced_accuracy_score(y_test, y_test_predict), 'recall': recall_score(y_test, y_test_predict), 'f1_score': f1_score(y_test, y_test_predict)},
        {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": int(cm_train[0][0]), "predicted_1": int(cm_train[0][1])}, 'true_1': {"predicted_0": int(cm_train[1][0]), "predicted_1": int(cm_train[1][1])}},
        {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": int(cm_test[0][0]), "predicted_1": int(cm_test[0][1])}, 'true_1': {"predicted_0": int(cm_test[1][0]), "predicted_1": int(cm_test[1][1])}}
    ]

    return metrics

results = metrics_report(y_test, y_train)


In [8]:
# Paso 8: Guardar metricas

import json
os.makedirs('../files/output', exist_ok=True)
with open('../files/output/metrics.json', 'w') as f:
    for metric in results:
        f.write(json.dumps(metric) + '\n')