In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, balanced_accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA

In [2]:
# Paso 1.
# Realice la limpieza de los datasets:
# - Renombre la columna "default payment next month" a "default".
# - Remueva la columna "ID".
# - Elimine los registros con información no disponible.
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".

def load_data(filename):
    """Read one zipped CSV split and return it with the target column last.

    Parameters
    ----------
    filename : str
        Split prefix; the file read is
        ``../files/input/{filename}_data.csv.zip``.

    Returns
    -------
    pandas.DataFrame
        Every column of the file, reordered so that
        "default payment next month" is the final column.

    Raises
    ------
    KeyError
        If the file has no "default payment next month" column.
    """
    target_col = "default payment next month"
    frame = pd.read_csv(
        f"../files/input/{filename}_data.csv.zip",
        index_col=False,
        compression='zip',
    )
    # Feature columns keep their file order; the target is moved to the end.
    ordered = [name for name in frame.columns if name != target_col] + [target_col]
    return frame[ordered]

def clean_data(df):
    """Return a cleaned copy of a raw split; the input frame is not modified.

    Cleaning rules, applied in order:
      1. Rename "default payment next month" to "default".
      2. Drop the "ID" column (skipped when it is already absent, so the
         function can be applied again to its own output).
      3. Drop rows containing NaN in any column.
      4. Drop rows where SEX, EDUCATION or MARRIAGE equals 0 (0 is treated
         as the "not available" code for these columns).
      5. Collapse EDUCATION values above 4 into 4 (the "others" category).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw split containing at least SEX, EDUCATION and MARRIAGE.

    Returns
    -------
    pandas.DataFrame
        New frame with the rules above applied; the original index labels
        of the surviving rows are kept.
    """
    cleaned = (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"], errors="ignore")
        .dropna()
    )

    has_missing_code = cleaned[['SEX', 'EDUCATION', 'MARRIAGE']].eq(0).any(axis=1)
    # Explicit copy: assigning a column on a boolean-filtered frame would
    # otherwise be a chained assignment (SettingWithCopyWarning).
    cleaned = cleaned.loc[~has_missing_code].copy()

    # Vectorized equivalent of "4 if x > 4 else x".
    cleaned["EDUCATION"] = cleaned["EDUCATION"].clip(upper=4)

    return cleaned

In [3]:
# Load the train and test splits with the same loader.
df_train, df_test = (load_data(split) for split in ("train", "test"))

In [4]:
# Apply the same cleaning rules to both splits; the names are rebound to
# the cleaned frames, which is what the following cells consume.
df_train = clean_data(df_train)
df_test = clean_data(df_test)

In [5]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

def split_data(df):
    """Separate a cleaned frame into its feature matrix and target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "default" column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the "default" column and
        ``y`` is the "default" column as a Series.

    Raises
    ------
    KeyError
        If ``df`` has no "default" column.
    """
    target_name = "default"
    labels = df[target_name]
    features = df.drop(columns=[target_name])
    return features, labels

In [6]:
# Features/target for each split, train first and test second.
(x_train, y_train), (x_test, y_test) = (
    split_data(part) for part in (df_train, df_test)
)

In [7]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categóricas usando el método
#   one-hot-encoding.
# - Descompone la matriz de entrada usando componentes principales.
#   El PCA usa todas las componentes.
# - Escala la matriz de entrada al intervalo [0, 1].
# - Selecciona las K columnas más relevantes de la matriz de entrada.
# - Ajusta una red neuronal tipo MLP.

def make_pipeline():
    """Build the (unfitted) classification pipeline.

    Steps, in order:
      1. ``preprocessor`` - one-hot encodes SEX, EDUCATION and MARRIAGE and
         standardizes every remaining column with ``StandardScaler``.
      2. ``pca`` - ``PCA`` with default settings (all components are kept).
      3. ``feature_selection`` - ``SelectKBest`` scored with ``f_classif``;
         ``k`` is left at its default here.
      4. ``classifier`` - ``MLPClassifier`` with ``random_state=16`` and
         early stopping.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A fresh pipeline instance on every call.
    """
    cat_features = ['SEX', 'EDUCATION', 'MARRIAGE']

    # handle_unknown="ignore": a category that is missing from the data the
    # encoder was fitted on (e.g. a rare level absent from one CV training
    # fold) is encoded as all zeros instead of raising at transform time.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown="ignore"), cat_features),
        ],
        remainder=StandardScaler()
    )

    # NOTE(review): the step description for this pipeline mentions scaling
    # to [0, 1] after PCA; here the non-categorical columns are standardized
    # before PCA instead. Confirm which behavior is intended.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA()),
        ('feature_selection', SelectKBest(f_classif)),
        ('classifier', MLPClassifier(random_state=16, early_stopping=True))
    ])

    return pipeline


In [8]:
# Two independent, unfitted pipeline instances (one per grid search below).
pipeline_tr, pipeline_ts = make_pipeline(), make_pipeline()

In [9]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.

def make_grid_search(pipeline, max_k=None):
    """Wrap ``pipeline`` in a 10-fold, balanced-accuracy grid search.

    The grid explores ``feature_selection__k`` in ``1..max_k`` and
    ``classifier__alpha`` in ``[0.1, 0.25, 0.5]``.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline exposing ``feature_selection`` and ``classifier`` steps.
    max_k : int, optional
        Largest ``k`` tried by ``SelectKBest``. When omitted, the number of
        columns of the notebook-level ``x_train`` is used (the original
        behavior), so ``x_train`` must already be defined in that case.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        Unfitted search object.
    """
    if max_k is None:
        # Backward-compatible default; pass max_k explicitly to avoid the
        # dependency on notebook state.
        max_k = x_train.shape[1]

    param_grid = {
        'feature_selection__k': range(1, max_k + 1),
        'classifier__alpha': [0.1, 0.25, 0.5],
    }

    # Shuffled, seeded stratified folds so repeated runs use the same splits.
    stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=16)

    return GridSearchCV(
        pipeline,
        param_grid,
        cv=stratified_kfold,
        scoring='balanced_accuracy',
        n_jobs=-1
    )

In [10]:
# Grid search around the training pipeline, fitted on the training split.
# The fit call is the last expression so the fitted estimator is displayed.
model_tr = make_grid_search(pipeline_tr)
model_tr.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [11]:
# Best parameters, best cross-validated score and the refitted pipeline of
# the search fitted on the training split, one item per line.
print(
    "Mejor modelo encontrado:",
    model_tr.best_params_,
    model_tr.best_score_,
    model_tr.best_estimator_,
    sep="\n",
)

Mejor modelo encontrado:
{'classifier__alpha': 0.25, 'feature_selection__k': 14}
0.6584527656724101
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder=StandardScaler(),
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE'])])),
                ('pca', PCA()), ('feature_selection', SelectKBest(k=14)),
                ('classifier',
                 MLPClassifier(alpha=0.25, early_stopping=True,
                               random_state=16))])


In [12]:
# Second, independent grid search fitted on the TEST split (x_test, y_test).
# NOTE(review): any score obtained by predicting x_test with model_ts is an
# in-sample score, not a held-out evaluation - confirm this is intended.
model_ts = make_grid_search(pipeline_ts)
model_ts.fit(x_test, y_test)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
# Best parameters, best cross-validated score and the refitted pipeline of
# the search stored in model_ts, one item per line.
print(
    "Mejor modelo encontrado:",
    model_ts.best_params_,
    model_ts.best_score_,
    model_ts.best_estimator_,
    sep="\n",
)

Mejor modelo encontrado:
{'classifier__alpha': 0.5, 'feature_selection__k': 22}
0.6460462227397457
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder=StandardScaler(),
                                   transformers=[('cat', OneHotEncoder(),
                                                  ['SEX', 'EDUCATION',
                                                   'MARRIAGE'])])),
                ('pca', PCA()), ('feature_selection', SelectKBest(k=22)),
                ('classifier',
                 MLPClassifier(alpha=0.5, early_stopping=True,
                               random_state=16))])


In [14]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usando la librería gzip.

def save_model(grid_search, filename):
    """Pickle ``grid_search`` into a gzip-compressed file.

    The object is written to ``../files/models/{filename}.pkl.gz``; the
    ``../files/models`` directory is created first when it does not exist.
    An existing file with the same name is overwritten.

    Parameters
    ----------
    grid_search : object
        Any picklable object (in this notebook, a fitted grid search).
    filename : str
        File name without the ``.pkl.gz`` suffix.
    """
    import gzip
    import pickle
    import os

    models_dir = "../files/models"
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    target = f"../files/models/{filename}.pkl.gz"
    with gzip.open(target, "wb") as handle:
        pickle.dump(grid_search, handle)

In [15]:
# Persist the grid search fitted on the training split as model.pkl.gz.
save_model(model_tr, "model")

In [16]:
# Paso 6.
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

def calculate_metrics(grid_search, x, y, filename):
    """Score a fitted estimator on one dataset and return a metrics record.

    Parameters
    ----------
    grid_search : estimator
        Fitted object exposing ``predict``.
    x : array-like
        Features passed to ``predict``.
    y : array-like
        True labels the predictions are compared against.
    filename : str
        Label stored under the ``'dataset'`` key (e.g. "train" or "test").

    Returns
    -------
    dict
        Keys, in order: ``type`` (always ``'metrics'``), ``dataset``,
        ``precision``, ``balanced_accuracy``, ``recall``, ``f1_score``.
        Each score is computed with the scikit-learn function's defaults.
    """
    predictions = grid_search.predict(x)

    # (record key, scoring function) pairs in the order they are stored.
    scorers = (
        ('precision', precision_score),
        ('balanced_accuracy', balanced_accuracy_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )

    report = {'type': 'metrics', 'dataset': filename}
    for label, scorer in scorers:
        report[label] = scorer(y, predictions)

    return report

def save_metrics(metrics):
    """Merge records into ``../files/output/metrics.json`` (one JSON per line).

    Existing records are kept, except those whose ``(type, dataset)`` pair
    matches an incoming record: these are replaced, so running the notebook
    again refreshes the file instead of appending duplicates. New records
    are written after the surviving existing ones.

    Parameters
    ----------
    metrics : dict or iterable of dict
        One record or several records. Each must be JSON-serializable.

    Notes
    -----
    The output directory is created when missing. Blank lines in an
    existing file are ignored when it is read back.
    """
    import json
    import os

    output_dir = "../files/output"
    output_file = os.path.join(output_dir, "metrics.json")

    # A bare dict is treated as a single record; list.extend on a dict
    # would otherwise add its keys rather than the record itself.
    new_records = [metrics] if isinstance(metrics, dict) else list(metrics)

    def _identity(record):
        """Return the (type, dataset) pair identifying a record, or None."""
        if not isinstance(record, dict):
            return None
        kind, dataset = record.get("type"), record.get("dataset")
        if isinstance(kind, str) and isinstance(dataset, str):
            return (kind, dataset)
        return None

    os.makedirs(output_dir, exist_ok=True)

    existing_data = []
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            existing_data = [json.loads(line) for line in f if line.strip()]

    # Drop stale versions of the records that are about to be written.
    incoming = {_identity(record) for record in new_records} - {None}
    merged = [r for r in existing_data if _identity(r) not in incoming]
    merged.extend(new_records)

    with open(output_file, "w", encoding="utf-8") as f:
        for record in merged:
            json.dump(record, f)
            f.write("\n")

In [17]:
# Both rows are scored with model_tr, the search fitted on the training
# split (and the model that is saved). Scoring x_test with a model fitted
# on x_test itself would report an in-sample score, not a held-out one.
metrics_train = calculate_metrics(model_tr, x_train, y_train, "train")
metrics_test = calculate_metrics(model_tr, x_test, y_test, "test")
metrics = [metrics_train, metrics_test]

save_metrics(metrics)

In [18]:
# Paso 7.
# Calcule las matrices de confusión para los conjuntos de entrenamiento y
# prueba. Guárdelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las métricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicted_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
# {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicted_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}

def calculate_cm_matrix(grid_search, x, y, filename):
    """Build a confusion-matrix record for one dataset.

    Parameters
    ----------
    grid_search : estimator
        Fitted object exposing ``predict``.
    x : array-like
        Features passed to ``predict``.
    y : array-like
        True labels; assumed to be encoded as 0 / 1, as the record keys
        (``true_0``, ``predicted_1``, ...) imply.
    filename : str
        Label stored under the ``'dataset'`` key (e.g. "train" or "test").

    Returns
    -------
    dict
        ``{'type': 'cm_matrix', 'dataset': ..., 'true_0': {...},
        'true_1': {...}}`` with plain ``int`` counts.
    """
    y_pred = grid_search.predict(x)

    # labels=[0, 1] pins the matrix to 2x2 even when a class is missing
    # from both y and y_pred; without it the matrix can be 1x1 and the
    # unpacking below would fail.
    true0_pred0, true0_pred1, true1_pred0, true1_pred1 = confusion_matrix(
        y, y_pred, labels=[0, 1]
    ).ravel()

    cm_matrix = {
        'type': 'cm_matrix',
        'dataset': filename,
        # int(...) converts NumPy integers so the record is JSON-serializable.
        'true_0': {"predicted_0": int(true0_pred0), "predicted_1": int(true0_pred1)},
        'true_1': {"predicted_0": int(true1_pred0), "predicted_1": int(true1_pred1)}
    }

    return cm_matrix

In [19]:
# Both confusion matrices come from model_tr, the search fitted on the
# training split (and the model that is saved), so the test matrix is a
# held-out evaluation rather than an in-sample one.
cm_train = calculate_cm_matrix(model_tr, x_train, y_train, "train")
cm_test = calculate_cm_matrix(model_tr, x_test, y_test, "test")
cm = [cm_train, cm_test]

save_metrics(cm)