In [51]:
import gzip
import json
import os
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [52]:
# Paso 1.
# Realice la limpieza de los datasets:
# - Renombre la columna "default payment next month" a "default".
# - Remueva la columna "ID".
# - Elimine los registros con informacion no disponible.
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".

def load_data(filename):
    """Read ``../files/input/<filename>_data.csv.zip`` into a DataFrame.

    The "default payment next month" column is moved to the last position;
    every other column keeps its original relative order.

    Parameters
    ----------
    filename : str
        Prefix of the archive name, e.g. ``"train"`` for ``train_data.csv.zip``.

    Returns
    -------
    pandas.DataFrame
        The file contents with the target column placed last.
    """
    target_name = "default payment next month"
    raw = pd.read_csv(
        f"../files/input/{filename}_data.csv.zip",
        index_col=False,
        compression="zip",
    )
    # Split off the target and glue it back on the right-hand side.
    predictors = raw.drop(columns=[target_name])
    return pd.concat([predictors, raw[target_name]], axis=1)

In [53]:
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:
    1. Rename "default payment next month" to "default".
    2. Drop the "ID" column (raises ``KeyError`` if it is absent).
    3. Drop every row that contains a missing value.
    4. Cap EDUCATION at 4, so all values above 4 fall into the "others" code.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the "ID" and "EDUCATION" columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # NOTE(review): only values > 4 are regrouped; whether EDUCATION == 0
    # should also be treated as "others"/missing is not specified here.
    return (
        df.rename(columns={"default payment next month": "default"})
        .drop(columns=["ID"])
        .dropna()
        .assign(
            # Keep values <= 4 as they are and replace the rest with 4.
            EDUCATION=lambda frame: frame["EDUCATION"].where(
                frame["EDUCATION"] <= 4, 4
            )
        )
    )

In [54]:
# Load the raw train and test splits (train first, then test).
df_train, df_test = load_data("train"), load_data("test")

In [55]:
# Clean both splits, rebinding the same names.
# Not idempotent: re-running this cell on already-cleaned frames raises
# KeyError because clean_data drops the "ID" column.
df_train, df_test = clean_data(df_train), clean_data(df_test)

In [56]:
# Paso 2.
# Divida los datasets en x_train, y_train, x_test, y_test.

In [57]:
def split_data(df):
    """Separate the predictors from the "default" target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a "default" column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without "default" and ``y`` is the
        "default" column as a Series.
    """
    target_column = "default"
    predictors = df.drop(columns=[target_column])
    labels = df[target_column]
    return predictors, labels

In [58]:
# Predictors / target for each split.
(x_train, y_train), (x_test, y_test) = split_data(df_train), split_data(df_test)

In [59]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Escala las demas variables al intervalo [0, 1].
# - Selecciona las K mejores caracteristicas.
# - Ajusta un modelo de regresion logistica.

In [60]:
def filter_columns(df):
    """Partition the columns of ``df`` into numeric and categorical names.

    SEX, EDUCATION and MARRIAGE are treated as categorical; every other
    column of ``df`` is treated as numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are partitioned.

    Returns
    -------
    tuple
        ``(num_features, cat_features)``: a ``pandas.Index`` with the
        remaining column names (as ordered by ``Index.difference``) and the
        list of categorical column names.
    """
    categorical = ['SEX', 'EDUCATION', 'MARRIAGE']
    return df.columns.difference(categorical), categorical

In [61]:
def make_pipeline(numeric_features, categorical_features):
    """Build the unfitted classification pipeline.

    Stages:
    1. ``preprocessor`` - one-hot encodes ``categorical_features`` and scales
       ``numeric_features`` with ``MinMaxScaler``; columns in neither list
       are dropped (ColumnTransformer default).
    2. ``feature_selection`` - ``SelectKBest`` scored with ``f_classif``.
    3. ``classifier`` - ``LogisticRegression`` with ``random_state=42``.

    Parameters
    ----------
    numeric_features : sequence of column labels
        Columns to scale.
    categorical_features : sequence of column labels
        Columns to one-hot encode.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The assembled, unfitted pipeline.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown="ignore": a category missing from the data the
            # encoder was fitted on (a rare level absent from a CV fold, or a
            # level that only occurs in new data) is encoded as all zeros
            # instead of raising at transform time.
            ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('scaler', MinMaxScaler(), numeric_features)
        ]
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=f_classif)),
        ('classifier', LogisticRegression(random_state=42))
    ])

    return pipeline

In [62]:
# Column groups for both pipelines.
# NOTE(review): the "_ts" groups are also derived from x_train, not x_test.
numeric_features_tr, categorical_features_tr = filter_columns(x_train)
numeric_features_ts, categorical_features_ts = filter_columns(x_train)

# One unfitted pipeline per column-group pair.
pipeline_tr = make_pipeline(numeric_features_tr, categorical_features_tr)
pipeline_ts = make_pipeline(numeric_features_ts, categorical_features_ts)

In [63]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.

In [66]:
def make_grid_search(pipeline, max_k=None):
    """Wrap ``pipeline`` in a 10-fold ``GridSearchCV`` scored by balanced accuracy.

    The grid covers the liblinear solver with l1/l2 penalties, five values of
    ``C``, three values of ``max_iter`` and ``k = 1..max_k`` for the
    ``feature_selection`` step.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline with steps named ``preprocessor``, ``feature_selection`` and
        ``classifier``.
    max_k : int, optional
        Largest ``k`` to try for feature selection. When omitted it is the
        number of input columns listed in the pipeline's ``preprocessor``
        step, so the function no longer depends on a notebook-level
        ``x_train`` variable. This default assumes ``preprocessor`` is a
        ColumnTransformer whose column selectors are sized collections
        (lists / Index); pass ``max_k`` explicitly otherwise.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The unfitted search object.
    """
    if max_k is None:
        preprocessor = pipeline.named_steps['preprocessor']
        # Each entry is a (name, transformer, columns) triple.
        max_k = sum(len(columns) for _, _, columns in preprocessor.transformers)

    param_grid = {
        'classifier__solver': ['liblinear'],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__max_iter': [100, 200, 300],
        'feature_selection__k': range(1, max_k + 1)
    }

    scorer = make_scorer(balanced_accuracy_score)

    grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring=scorer, n_jobs=-1)
    return grid_search

In [67]:
# Tune and fit on the training split; fit() returns the search object itself.
grid_search_tr = make_grid_search(pipeline_tr).fit(x_train, y_train)
grid_search_tr

In [82]:
# The test split is held out for evaluation only. Tuning or fitting a second
# search on it would leak the test labels into the model and make the reported
# test metrics meaningless. The name is kept for later cells, but it refers to
# the search fitted on the training split.
grid_search_ts = grid_search_tr

In [69]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usando la librería gzip.

In [70]:
def save_model(grid_search, filename):
    """Pickle ``grid_search`` into ``../files/models/<filename>.pkl.gz`` (gzip).

    The target directory is created when missing; an existing file with the
    same name is overwritten.

    Parameters
    ----------
    grid_search : object
        Any picklable object (here, the fitted grid search).
    filename : str
        File name without the ``.pkl.gz`` suffix.
    """
    models_dir = "../files/models"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(models_dir, exist_ok=True)

    with gzip.open(os.path.join(models_dir, f"{filename}.pkl.gz"), "wb") as f:
        pickle.dump(grid_search, f)

In [71]:
# Persist the search fitted on the training split as "model.pkl.gz".
save_model(grid_search=grid_search_tr, filename="model")

In [72]:
# Paso 6.
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'metrics', 'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'type': 'metrics', 'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

In [73]:
def calculate_metrics(grid_search, x, y, filename):
    """Score the predictions of ``grid_search`` on ``x`` against ``y``.

    Parameters
    ----------
    grid_search : estimator
        Fitted object exposing ``predict``.
    x : array-like
        Predictors passed to ``predict``.
    y : array-like
        True labels.
    filename : str
        Label stored under the ``'dataset'`` key (e.g. "train" or "test").

    Returns
    -------
    dict
        Keys, in order: ``type`` (always ``'metrics'``), ``dataset``,
        ``precision``, ``balanced_accuracy``, ``recall``, ``f1_score``.
    """
    predictions = grid_search.predict(x)

    # Insertion order here fixes the key order of the resulting record.
    score_functions = {
        'precision': precision_score,
        'balanced_accuracy': balanced_accuracy_score,
        'recall': recall_score,
        'f1_score': f1_score,
    }

    report = {'type': 'metrics', 'dataset': filename}
    for metric_name, score_fn in score_functions.items():
        report[metric_name] = score_fn(y, predictions)

    return report

In [74]:
def save_metrics(metrics):
    """Merge ``metrics`` into ``../files/output/metrics.json`` (one JSON per line).

    Existing rows are kept, except that a new record replaces any stored
    record with the same ``(type, dataset)`` pair. This makes the call
    idempotent: re-running the notebook refreshes rows instead of appending
    duplicates. Records without both keys are always appended. Blank lines in
    an existing file are ignored.

    Parameters
    ----------
    metrics : iterable of dict
        JSON-serialisable records to store.
    """
    output_dir = "../files/output"
    output_file = os.path.join(output_dir, "metrics.json")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    def record_key(record):
        # Identity of a row; None means "never treated as a duplicate".
        if isinstance(record, dict) and "type" in record and "dataset" in record:
            return (record["type"], record["dataset"])
        return None

    records = []
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            # Skip blank lines so a trailing newline cannot break json.loads.
            records = [json.loads(line) for line in f if line.strip()]

    for new_record in metrics:
        key = record_key(new_record)
        if key is not None:
            # Drop every stale row for this (type, dataset) before adding.
            records = [old for old in records if record_key(old) != key]
        records.append(new_record)

    with open(output_file, "w", encoding="utf-8") as f:
        for record in records:
            json.dump(record, f)
            f.write("\n")

In [83]:
# Both splits are scored with the model tuned and fitted on the training
# split. Scoring the test split with a model fitted on that same test split
# would report leaked, over-optimistic numbers.
metrics_train = calculate_metrics(grid_search_tr, x_train, y_train, "train")
metrics_test = calculate_metrics(grid_search_tr, x_test, y_test, "test")
metrics = [metrics_train, metrics_test]

save_metrics(metrics)

In [76]:
# Paso 7.
# Calcule las matrices de confusion para los conjuntos de entrenamiento y
# prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con la matriz de confusion de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicted_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
# {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicted_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}
#

In [77]:
def calculate_cm_matrix(grid_search, x, y, filename):
    """Build a JSON-friendly confusion-matrix record for one dataset.

    Parameters
    ----------
    grid_search : estimator
        Fitted object exposing ``predict``.
    x : array-like
        Predictors passed to ``predict``.
    y : array-like
        True labels, encoded as 0 / 1.
    filename : str
        Label stored under the ``'dataset'`` key (e.g. "train" or "test").

    Returns
    -------
    dict
        ``{'type': 'cm_matrix', 'dataset': ..., 'true_0': {...}, 'true_1': {...}}``
        with plain ``int`` counts.
    """
    y_pred = grid_search.predict(x)

    # Pin the label order so the matrix is always 2x2. Without ``labels`` a
    # sample where only one class occurs yields a 1x1 matrix and the indexing
    # below raises IndexError.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])

    cm_matrix = {
        'type': 'cm_matrix',
        'dataset': filename,
        # int(...) converts NumPy integers, which json cannot serialise.
        'true_0': {"predicted_0": int(cm[0][0]), "predicted_1": int(cm[0][1])},
        'true_1': {"predicted_0": int(cm[1][0]), "predicted_1": int(cm[1][1])}
    }

    return cm_matrix

In [84]:
# Both confusion matrices come from the model tuned and fitted on the
# training split; the test split is only used for evaluation.
cm_train = calculate_cm_matrix(grid_search_tr, x_train, y_train, "train")
cm_test = calculate_cm_matrix(grid_search_tr, x_test, y_test, "test")
cm = [cm_train, cm_test]

save_metrics(cm)