In [3]:
print('Importando librerías...')
import pandas as pd
import gzip
import json
import os
import pickle
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.svm import SVC as svm
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")



# Paso 1.
print('Paso 1...')
# Cargamos los datos
df_train = pd.read_csv('../files/input/train_data.csv.zip', index_col=False, compression="zip")
df_test = pd.read_csv('../files/input/test_data.csv.zip', index_col=False, compression="zip")

# Renombramos la columna "default payment next month" a "default"
df_train = df_train.rename(columns={"default payment next month": "default"})
df_test = df_test.rename(columns={"default payment next month": "default"})

# Removemos la columna "ID"
df_train = df_train.drop(columns=["ID"])
df_test = df_test.drop(columns=["ID"])

# Eliminamos los registros con informacion no disponible
df_train = df_train.dropna()
df_train = df_train[df_train['EDUCATION'] != 0]
df_train = df_train[df_train['MARRIAGE'] != 0]
df_test = df_test.dropna()
df_test = df_test[df_test['EDUCATION'] != 0]
df_test = df_test[df_test['MARRIAGE'] != 0]

# Agrupamos los valores de EDUCATION > 4 en la categoria "others"
df_train.loc[df_train["EDUCATION"] > 4, "EDUCATION"] = 4
df_test.loc[df_test["EDUCATION"] > 4, "EDUCATION"] = 4


# Paso 2.
print('Paso 2...')
# Divida los datasets en x_train, y_train, x_test, y_test.
x_train = df_train.drop(columns=["default"])
y_train = df_train["default"]
x_test = df_test.drop(columns=["default"])
y_test = df_test["default"]


# Paso 3.
print('Paso 3...')
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Descompone la matriz de entrada usando PCA. El PCA usa todas las componentes.
# - Estandariza la matriz de entrada.
# - Selecciona las K columnas mas relevantes de la matrix de entrada.
# - Ajusta una maquina de vectores de soporte (svm).

# Creamos el transformer
transformer = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(dtype="int"), 
         ["SEX", "EDUCATION", "MARRIAGE"]),
    ],
    remainder="passthrough",
)

# Creamos el pipeline
pipeline = Pipeline(
    steps=[
        ("transformer", transformer),
        ("pca", PCA(n_components=22)),
        ("scaler", StandardScaler()),
        ("feature_selection", SelectKBest(score_func=f_classif)),
        ("classifier", MLPClassifier(activation='relu', solver='adam', max_iter=1000)),
    ],
    verbose=False,
)


# Paso 4.
print('Paso 4...')
params = {
    "feature_selection__k": [1],
    "classifier__hidden_layer_sizes": [10],
}
grid = GridSearchCV(pipeline, params, cv=10, scoring='balanced_accuracy', n_jobs=-1, refit=True)
grid.fit(x_train, y_train)
model = grid

Importando librerías...
Paso 1...
Paso 2...
Paso 3...
Paso 4...


In [7]:
for component in model.estimator:
    print(component) 

ColumnTransformer(remainder=MinMaxScaler(),
                  transformers=[('ohe', OneHotEncoder(dtype='int'),
                                 ['SEX', 'EDUCATION', 'MARRIAGE'])])
MinMaxScaler()
PCA(n_components=22)
SelectKBest()
MLPClassifier(max_iter=1000)


In [8]:
MODEL_COMPONENTS = [
    "OneHotEncoder",
    "PCA",
    "StandardScaler",
    "SelectKBest",
    "MLPClassifier",
]

In [10]:
"GridSearchCV" in str(type(model))

True

In [11]:
current_components = [str(model.estimator[i]) for i in range(len(model.estimator))]
current_components

["ColumnTransformer(remainder=MinMaxScaler(),\n                  transformers=[('ohe', OneHotEncoder(dtype='int'),\n                                 ['SEX', 'EDUCATION', 'MARRIAGE'])])",
 'MinMaxScaler()',
 'PCA(n_components=22)',
 'SelectKBest()',
 'MLPClassifier(max_iter=1000)']

In [None]:

current_components = [str(model.estimator[i]) for i in range(len(model.estimator))]
for component in MODEL_COMPONENTS:
    assert any(component in x for x in current_components)

In [1]:
import pandas as pd
df_train = pd.read_csv('../files/input/train_data.csv.zip', index_col=False, compression="zip")
df_train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,10748,310000,1,3,1,32,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,12574,10000,2,3,1,49,-1,-1,-2,-1,...,1690,1138,930,0,0,2828,0,182,0,1
2,29677,50000,1,2,1,28,-1,-1,-1,0,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,8857,80000,2,3,1,52,2,2,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,21099,270000,1,1,2,34,1,2,0,0,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [2]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          21000 non-null  int64
 1   LIMIT_BAL                   21000 non-null  int64
 2   SEX                         21000 non-null  int64
 3   EDUCATION                   21000 non-null  int64
 4   MARRIAGE                    21000 non-null  int64
 5   AGE                         21000 non-null  int64
 6   PAY_0                       21000 non-null  int64
 7   PAY_2                       21000 non-null  int64
 8   PAY_3                       21000 non-null  int64
 9   PAY_4                       21000 non-null  int64
 10  PAY_5                       21000 non-null  int64
 11  PAY_6                       21000 non-null  int64
 12  BILL_AMT1                   21000 non-null  int64
 13  BILL_AMT2                   21000 non-null  int64
 14  BILL_A

In [None]:
    # "feature_selection__k": [10, 15, 20],
    # "classifier__hidden_layer_sizes": [100, 200, 300],

    # "classifier__random_state": [10, 20, 30, 40, 50],
    # "feature_selection__k": [10, 15, 20],
    # "classifier__hidden_layer_sizes": [10, 50, 100, 200],
    # "classifier__alpha": [0.05, 0.1, 0.15, 0.20, 0.25, 0.3],
# Mejores hiperparametros: {'classifier__alpha': 0.1, 'classifier__hidden_layer_sizes': 200, 'classifier__random_state': 20, 'feature_selection__k': 10}
# score_test: 0.6627120425590184

    # "classifier__random_state": [20],
    # "feature_selection__k": [5, 10, 15, 20],
    # "classifier__hidden_layer_sizes": [100, 200, 300, 400, 500, 600],
    # "classifier__alpha": [0.1, 0.15, 0.20, 0.25],
# Mejores hiperparametros: {'classifier__alpha': 0.1, 'classifier__hidden_layer_sizes': 200, 'classifier__random_state': 20, 'feature_selection__k': 10}
# score_test: 0.6627120425590184

    # "classifier__random_state": [19, 20, 21],
    # "feature_selection__k": [9, 10, 11],
    # "classifier__hidden_layer_sizes": [190, 200, 210],
    # "classifier__alpha": [0.09, 0.1, .11],

    # "classifier__random_state": [17, 18, 19, 20],
    # "feature_selection__k": [20, 25],
    # "classifier__hidden_layer_sizes": [50, 100, 150],
    # "classifier__alpha": [0.25, 0.26, 0.27, 0.28, 0.29, 0.30],

    # "classifier__random_state": [20],
    # "feature_selection__k": [20],
    # "classifier__hidden_layer_sizes": [(50, 50, 50)],
    # "classifier__alpha": [0.01, 0.1, 0.3],
    # Mejores hiperparametros: {'classifier__alpha': 0.3, 'classifier__hidden_layer_sizes': (50, 50, 50), 'classifier__random_state': 20, 'feature_selection__k': 20}
    # score_test: 0.6709618653855483

    # "classifier__random_state": [15, 20, 25],
    # "feature_selection__k": [10, 15, 20, 25],
    # "classifier__hidden_layer_sizes": [(40, 40, 40, 40), (30, 30, 30, 30), (50, 40, 40, 30)],
    # "classifier__alpha": [0.1, 0.15, 0.20, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5],
# Mejores hiperparametros: {'classifier__alpha': 0.45, 'classifier__hidden_layer_sizes': (40, 40, 40, 40), 'classifier__random_state': 15, 'feature_selection__k': 10}
# score_test: 0.6738458207311578
#

    # "classifier__random_state": [15],
    # "feature_selection__k": [20],
    # "classifier__hidden_layer_sizes": [(40, 40, 40, 30)],
    # "classifier__alpha": [0.46],

    # "classifier__random_state": [15],
    # "feature_selection__k": [15],
    # # "classifier__hidden_layer_sizes": [(100, 80, 20, 40)],
    # "classifier__hidden_layer_sizes": [(50, 30, 40, 60)],
    # # "classifier__alpha": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    # # "classifier__alpha": [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
    # "classifier__alpha": [0.25],