In [1]:
#CARGA DE LIBRERIAS
import pandas as pd
import pickle
import gzip
import os
import json
import numpy as np 
import time
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [2]:
# DATA LOCATION
# Paths to the zipped CSV files that hold the training and test sets.
# Both are relative paths, so they resolve against the current working directory.
train_data_zip = '../files/input/train_data.csv.zip'
test_data_zip = '../files/input/test_data.csv.zip'

In [3]:
# LOAD THE ZIPPED FILES
# Read each zipped CSV directly into a DataFrame. index_col=False tells
# pandas not to use the first column as the index.
train_data = pd.read_csv(train_data_zip, index_col=False, compression='zip')
test_data = pd.read_csv(test_data_zip, index_col=False, compression='zip')

In [4]:
# Paso 1.
# Realice la limpieza de los datasets:
# - Renombre la columna "default payment next month" a "default".
# - Remueva la columna "ID".
# - Elimine los registros con informacion no disponible.
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".

def clean_data(data_df):
    """Return a cleaned copy of a raw credit-default DataFrame.

    Cleaning steps, in order:
      1. Rename "default payment next month" to "default".
      2. Drop the "ID" column if it is present.
      3. Treat 0 in EDUCATION and MARRIAGE as missing, then drop every row
         that has any missing value.
      4. Collapse EDUCATION values greater than 4 into category 4 ("others").

    The input frame is never modified. The function is idempotent: calling it
    on an already-cleaned frame returns an equal frame instead of raising,
    so the notebook cell that rebinds ``train_data``/``test_data`` can be
    re-run safely.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing at least the EDUCATION and MARRIAGE columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    df = data_df.copy()
    df = df.rename(columns={'default payment next month': 'default'})
    # errors='ignore': a frame that was already cleaned no longer has "ID";
    # without this a second call raises KeyError.
    df = df.drop(columns='ID', errors='ignore')
    # 0 is used as a "not available" marker in these two columns.
    df['EDUCATION'] = df['EDUCATION'].replace(0, np.nan)
    df['MARRIAGE'] = df['MARRIAGE'].replace(0, np.nan)
    df = df.dropna()
    # Group every education level above 4 into 4 ("others").
    df.loc[df['EDUCATION'] > 4, 'EDUCATION'] = 4
    return df

In [5]:
# Apply the cleaning steps to both splits; the names are rebound to the
# cleaned frames, so the raw frames are no longer reachable afterwards.
train_data = clean_data(train_data)
test_data = clean_data(test_data)

In [6]:
# Step 2.
# Split the datasets into x_train, y_train, x_test, y_test:
# the features are every column except "default"; the target is "default".
x_train, y_train = train_data.drop(columns='default'), train_data['default']
x_test, y_test = test_data.drop(columns='default'), test_data['default']

In [9]:
# Paso 3.
# Cree un pipeline para el modelo de clasificación. Este pipeline debe
# contener las siguientes capas:
# - Transforma las variables categoricas usando el método
#   one-hot-encoding.
# - Descompone la matriz de entrada usando PCA. El PCA usa todas las componentes.
# - Estandariza la matriz de entrada.
# - Selecciona las K columnas mas relevantes de la matriz de entrada.
# - Ajusta una maquina de vectores de soporte (svm).

def create_pipeline(df):
    """Build the (unfitted) preprocessing + SVM classification pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; only its column names are read. SEX, EDUCATION and
        MARRIAGE are one-hot encoded, every other column is standardized.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps 'preprocessor', 'pca', 'select_k_best'
        and 'model', in that order.
    """
    categorical = ['SEX', 'EDUCATION', 'MARRIAGE']
    # Everything that is not categorical is treated as numeric, keeping
    # the original column order.
    numeric = list(filter(lambda name: name not in categorical, df.columns))

    encode_and_scale = ColumnTransformer(transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', StandardScaler(), numeric),
    ])

    stages = [
        ('preprocessor', encode_and_scale),
        ('pca', PCA()),
        ('select_k_best', SelectKBest(f_classif)),
        ('model', SVC()),
    ]
    return Pipeline(steps=stages)

In [10]:
# Build the unfitted pipeline from the training features' column names.
pipeline = create_pipeline(x_train)

In [11]:
# Paso 4.
# Optimice los hiperparametros del pipeline usando validación cruzada.
# Use 10 splits para la validación cruzada. Use la función de precision
# balanceada para medir la precisión del modelo.

def optimize_hyperparameters(pipeline, x_train, y_train):
    """Fit a 10-fold cross-validated grid search around ``pipeline``.

    Candidates are scored with balanced accuracy. Every hyperparameter in
    the grid lists a single value, so exactly one candidate is evaluated.

    Parameters
    ----------
    pipeline : estimator
        Pipeline exposing the 'pca', 'select_k_best' and 'model' steps.
    x_train, y_train
        Training features and target passed to ``fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search_space = {
        'pca__n_components': [21],
        'select_k_best__k': [12],
        'model__C': [0.8],
        'model__kernel': ['rbf'],
        'model__gamma': [0.1],
    }
    searcher = GridSearchCV(
        estimator=pipeline,
        param_grid=search_space,
        scoring='balanced_accuracy',
        cv=10,
        n_jobs=-1,  # use every available core
        verbose=2,
    )
    # fit() returns the search object itself.
    return searcher.fit(x_train, y_train)

In [12]:
# Optimize the hyperparameters and report how long the search took.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
model = optimize_hyperparameters(pipeline, x_train, y_train)
end = time.perf_counter()
print(f'Tiempo de optimizacion parametros: {end - start:.2f} seconds')
print(model.best_params_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Tiempo de optimizacion parametros: 87.73 seconds
{'model__C': 0.8, 'model__gamma': 0.1, 'model__kernel': 'rbf', 'pca__n_components': 21, 'select_k_best__k': 12}


In [13]:
# Paso 5.
# Guarde el modelo (comprimido con gzip) como "files/models/model.pkl.gz".
# Recuerde que es posible guardar el modelo comprimido usando la libreria gzip.

def save_model(model, path='../files/models/model.pkl.gz'):
    """Pickle ``model`` and write it gzip-compressed to ``path``.

    Parameters
    ----------
    model : object
        Any picklable object (here, the fitted grid search).
    path : str, optional
        Destination file. Defaults to '../files/models/model.pkl.gz'.
        Missing parent directories are created; an existing file is
        overwritten.
    """
    directory = os.path.dirname(path)
    if directory:
        # exist_ok=True replaces the check-then-create pattern, which can
        # fail if the directory appears between the check and the call.
        os.makedirs(directory, exist_ok=True)

    with gzip.open(path, 'wb') as file:
        pickle.dump(model, file)

In [14]:
# Persist the fitted grid search returned by optimize_hyperparameters.
save_model(model)

In [15]:
# Paso 6.
# Calcule las metricas de precision, precision balanceada, recall,
# y f1-score para los conjuntos de entrenamiento y prueba.
# Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo.
# Este diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
# {'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}

def calculate_metrics(model, x_train, y_train, x_test, y_test):
    """Compute precision, balanced accuracy, recall and F1 for both splits.

    Each score is rounded to four decimals and converted to a plain float.
    Both result dictionaries are printed before being returned.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    x_train, y_train, x_test, y_test
        Features and true labels of the training and test sets.

    Returns
    -------
    tuple[dict, dict]
        ``(metrics_train, metrics_test)``; each dict has the keys 'type',
        'dataset', 'precision', 'balanced_accuracy', 'recall', 'f1_score'.
    """
    # Predict on both splits up front, training set first.
    splits = (
        ('train', y_train, model.predict(x_train)),
        ('test', y_test, model.predict(x_test)),
    )
    scorers = (
        ('precision', precision_score),
        ('balanced_accuracy', balanced_accuracy_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )

    reports = []
    for split_name, expected, predicted in splits:
        report = {'type': 'metrics', 'dataset': split_name}
        for metric_name, scorer in scorers:
            report[metric_name] = float(round(scorer(expected, predicted), 4))
        reports.append(report)

    train_report, test_report = reports
    print(train_report)
    print(test_report)

    return train_report, test_report

In [16]:
# Score the fitted model on both splits (the function also prints both dicts).
metrics_train, metrics_test = calculate_metrics(model, x_train, y_train, x_test, y_test)

{'type': 'metrics', 'dataset': 'train', 'precision': 0.7006, 'balanced_accuracy': 0.6624, 'recall': 0.371, 'f1_score': 0.4851}
{'type': 'metrics', 'dataset': 'test', 'precision': 0.675, 'balanced_accuracy': 0.6669, 'recall': 0.3835, 'f1_score': 0.4891}


In [17]:
# Paso 7.
# Calcule las matrices de confusion para los conjuntos de entrenamiento y
# prueba. Guardelas en el archivo files/output/metrics.json. Cada fila
# del archivo es un diccionario con las metricas de un modelo. Este
# diccionario tiene un campo para indicar si es el conjunto
# de entrenamiento o prueba. Por ejemplo:
#
# {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicted_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
# {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicted_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}
#
def calculate_confusion_matrix(model, x_train, y_train, x_test, y_test):
    """Build confusion-matrix records for the training and test sets.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict``.
    x_train, y_train, x_test, y_test
        Features and true labels of the training and test sets. Labels are
        expected to be the binary classes 0 and 1.

    Returns
    -------
    tuple[dict, dict]
        ``(cm_matrix_train, cm_matrix_test)``; each dict has the keys
        'type', 'dataset', 'true_0' and 'true_1', where the last two map
        'predicted_0'/'predicted_1' to plain-int counts.
    """
    def _as_record(dataset, y_true, y_pred):
        # labels=[0, 1] pins the matrix to 2x2. Without it, a split whose
        # true and predicted labels contain a single class yields a 1x1
        # matrix and the [0, 1] / [1, *] lookups raise IndexError.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        return {
            'type': 'cm_matrix',
            'dataset': dataset,
            'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
            'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])},
        }

    # Predict on both splits first, training set before test set.
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    cm_matrix_train = _as_record('train', y_train, y_train_pred)
    cm_matrix_test = _as_record('test', y_test, y_test_pred)

    return cm_matrix_train, cm_matrix_test

In [18]:
# Build the confusion-matrix records for both splits from the fitted model.
cm_matrix_train, cm_matrix_test = calculate_confusion_matrix(model, x_train, y_train, x_test, y_test)

In [19]:
# Show the training confusion-matrix record as the cell output.
cm_matrix_train

{'type': 'cm_matrix',
 'dataset': 'train',
 'true_0': {'predicted_0': 15479, 'predicted_1': 749},
 'true_1': {'predicted_0': 2972, 'predicted_1': 1753}}

In [20]:
# Save the metrics and confusion matrices in the output folder.
def save_metrics(metrics_train, metrics_test, cm_matrix_train, cm_matrix_test,
                 path='../files/output/metrics.json'):
    """Write the four result dictionaries to ``path`` as JSON Lines.

    Each dictionary is serialized on its own line, in the order train
    metrics, test metrics, train confusion matrix, test confusion matrix.
    The dictionaries are written as-is. Routing them through a DataFrame
    would align their differing keys into shared columns and pad every row
    with null entries (e.g. "true_0": null on the metric rows).

    Parameters
    ----------
    metrics_train, metrics_test : dict
        Metric records for the training and test sets.
    cm_matrix_train, cm_matrix_test : dict
        Confusion-matrix records for the training and test sets.
    path : str, optional
        Destination file. Defaults to '../files/output/metrics.json'.
        Missing parent directories are created; an existing file is
        overwritten.
    """
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    records = [metrics_train, metrics_test, cm_matrix_train, cm_matrix_test]
    with open(path, 'w', encoding='utf-8') as file:
        for record in records:
            file.write(json.dumps(record) + '\n')


In [21]:
# Write the metric and confusion-matrix records to the output file.
save_metrics(metrics_train, metrics_test, cm_matrix_train, cm_matrix_test)