# Paso 1: Limpieza de los datos

## Cargar dataset

In [164]:
import pandas as pd

# Read the zipped train and test CSV files (train first, then test).
train_data, test_data = map(
    pd.read_csv,
    ("../files/input/train_data.csv.zip", "../files/input/test_data.csv.zip"),
)

## Procesamiento de la data

### Renombrar columnas

In [165]:
# Shorten the target column name to "default" on both splits (in place).
for split_frame in (train_data, test_data):
    split_frame.rename(columns={"default payment next month": "default"}, inplace=True)

### Remover columnas

In [166]:
# Drop the "ID" column from both splits (in place) so it is not used as a feature.
train_data.drop(columns="ID", inplace=True)
test_data.drop(columns="ID", inplace=True)

### Eliminar datos faltantes

In [167]:
# Keep only rows whose MARRIAGE and EDUCATION codes are both non-zero; this
# notebook treats a 0 code in either column as a missing value.
for split_frame in (train_data, test_data):
    split_frame.query('MARRIAGE != 0 and EDUCATION != 0', inplace=True)

### Agrupar valores en una categoría

In [168]:
# Collapse every EDUCATION code above 4 into category 4.
# Series.clip applies the cap as a single vectorized operation.
train_data['EDUCATION'] = train_data['EDUCATION'].clip(upper=4)
test_data['EDUCATION'] = test_data['EDUCATION'].clip(upper=4)

# Paso 2: División del dataset

In [169]:
# Separate the predictors (x_*) from the "default" target (y_*) for each split.
x_train, y_train = train_data.drop(columns="default"), train_data["default"]
x_test, y_test = test_data.drop(columns="default"), test_data["default"]

# Paso 3: Creación del pipeline

- Transforma las variables categóricas usando el método one-hot-encoding.
- Descompone la matriz de entrada usando componentes principales. El PCA usa todas las componentes.
- Escala la matriz de entrada al intervalo [0, 1].
- Selecciona las K columnas más relevantes de la matriz de entrada.
- Ajusta una red neuronal tipo MLP.

### Transformadores

In [170]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Default-configured transformers; they are bound to specific columns by the
# ColumnTransformer built in the next cell.
# NOTE(review): the "Paso 3" step list asks for scaling to the interval [0, 1],
# while StandardScaler standardizes to zero mean / unit variance — confirm which
# one is intended before changing it (the recorded scores depend on this choice).
oneHotEncoder = OneHotEncoder()
scaler = StandardScaler()

### ColumnTransformer

In [171]:
import numpy as np
from sklearn.compose import ColumnTransformer

# Categorical columns go through the one-hot encoder; every other predictor
# column goes through the scaler.
categorical_features = ["SEX", "EDUCATION", "MARRIAGE"]
numerical_features = [
    column for column in x_train.columns if column not in categorical_features
]
num_columns = numerical_features  # alias bound to the same list object

preprocessor = ColumnTransformer(
    [
        ("oneHotEncoder", oneHotEncoder, categorical_features),
        ("scaler", scaler, numerical_features),
    ]
)

### Descomposición con PCA

In [172]:
from sklearn.decomposition import PCA

# n_components=None (the default) keeps all components.
pca = PCA(n_components=None)

### Seleccionar K características

In [173]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate feature selector scored with f_classif. `k` is left at its default
# here and is set through the "kSelect__k" entry of the grid-search parameters.
selectKBest = SelectKBest(f_classif)

### Red neuronal

In [174]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron classifier with a fixed seed and a high iteration cap;
# the remaining hyperparameters are supplied by the grid search.
mlp = MLPClassifier(
    random_state=21,
    max_iter=15000,
)

## Pipeline

In [175]:
from sklearn.pipeline import Pipeline

# Step order as executed: preprocessing -> K-best selection -> PCA -> MLP.
# NOTE(review): the "Paso 3" step list places PCA before scaling and feature
# selection; confirm that the order used here is the intended one.
estimators = [
    ("preprocessor", preprocessor),
    ("kSelect", selectKBest),
    ("pca", pca),
    ("mlp", mlp),
]

pipeline = Pipeline(estimators, verbose=False)

# Paso 4: Optimizar hiperparámetros

- Optimice los hiperparámetros del pipeline usando validación cruzada.
- Use 10 splits para la validación cruzada. Use la función de precisión
  balanceada para medir la precisión del modelo.

## GridSearchCV

In [176]:
# Single-point grid: every hyperparameter lists exactly one candidate value, so
# the search cross-validates only this one configuration.
# Keys follow the "<pipeline step>__<parameter>" convention.
param_grid = dict(
    kSelect__k=[20],                             # SelectKBest
    pca__n_components=[None],                    # PCA
    mlp__hidden_layer_sizes=[(50, 30, 40, 60)],  # MLP
    mlp__learning_rate_init=[0.001],
    mlp__alpha=[0.26],
)

In [177]:
from sklearn.model_selection import GridSearchCV

# 10-split cross-validated search scored by balanced accuracy, run on all CPU
# cores; refit=True retrains the best configuration on the full training data.
gridSearchCV = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    refit=True,
    n_jobs=-1,
)

In [178]:
# Run the cross-validated search on the training split. Because refit=True, the
# best pipeline is retrained on all of x_train afterwards.
gridSearchCV.fit(x_train, y_train)

In [179]:
# Display the fitted "mlp" step of the best pipeline found by the search.
gridSearchCV.best_estimator_.named_steps['mlp']

In [180]:
# Display the cross-validated score (balanced accuracy) of the best configuration.
gridSearchCV.best_score_

np.float64(0.6547749823553126)

In [181]:
# Display the parameter combination selected by the search.
gridSearchCV.best_params_

{'kSelect__k': 20,
 'mlp__alpha': 0.26,
 'mlp__hidden_layer_sizes': (50, 30, 40, 60),
 'mlp__learning_rate_init': 0.001,
 'pca__n_components': None}

In [182]:
# Score the refitted best pipeline on both splits with the search's scoring
# metric (balanced accuracy).
train_score = gridSearchCV.score(x_train, y_train)
test_score = gridSearchCV.score(x_test, y_test)

# The first line reports the training split and the second the test split; the
# training line is labelled "entrenamiento" so it cannot be mistaken for the
# test set ("prueba").
print(f'Score en el conjunto de entrenamiento: {train_score:.4f}')
print(f'Score en el conjunto de testing: {test_score:.4f}')

Score en el conjunto de prueba: 0.6629
Score en el conjunto de testing: 0.6685


# Paso 4.5: Evaluar métricas

In [183]:
# Minimum scores the model must exceed: index 0 is compared against the train
# score, index 1 against the test score.
SCORES = [0.661, 0.666]

In [184]:
# Report, per split, whether the achieved score is strictly above its threshold.
for label, score, minimum in (
    ("Válido TRAIN: ", train_score, SCORES[0]),
    ("Válido TEST: ", test_score, SCORES[1]),
):
    print(label, score > minimum)

Válido TRAIN:  True
Válido TEST:  True


# Paso 5: Salvar el modelo

Salve el modelo como "files/models/model.pkl.gz".

In [185]:
# The object that gets saved and evaluated below is the whole fitted
# GridSearchCV wrapper, not only its best_estimator_.
model = gridSearchCV

In [186]:
import gzip
import os
import pickle

# Save the model as a gzip-compressed pickle
model_filename = '../files/models/model.pkl.gz'

# Create the target directory first so the write does not fail with
# FileNotFoundError when the folder does not exist yet (e.g. a fresh checkout).
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

with gzip.open(model_filename, 'wb') as f:
    pickle.dump(model, f)

# Paso 6: Cálculo de métricas

- Calcule las métricas de precisión, precisión balanceada, recall y f1-score para los conjuntos de entrenamiento y prueba.
- Guárdelas en el archivo files/output/metrics.json.

In [187]:
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
import json

def calculate_metrics(model, X, y, dataset_name):
    """Score ``model`` on one dataset and return the results as a flat record.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels aligned with ``X``.
        dataset_name: Label stored under the ``'dataset'`` key (e.g. ``'train'``).

    Returns:
        dict: Keys ``'type'`` (always ``'metrics'``), ``'dataset'``,
        ``'precision'``, ``'balanced_accuracy'``, ``'recall'`` and
        ``'f1_score'``, in that order.
    """
    predictions = model.predict(X)

    # (output key, scoring function) pairs, in the order they appear in the record.
    scorers = (
        ('precision', precision_score),
        ('balanced_accuracy', balanced_accuracy_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )

    record = {'type': 'metrics', 'dataset': dataset_name}
    for key, scorer in scorers:
        record[key] = scorer(y, predictions)
    return record


In [188]:
# Compute the metric records for the train split first, then the test split.
metrics = [
    calculate_metrics(model, features, target, split)
    for features, target, split in (
        (x_train, y_train, 'train'),
        (x_test, y_test, 'test'),
    )
]
train_metrics, test_metrics = metrics

In [189]:
# Display both metric records (train first, then test).
metrics

[{'type': 'metrics',
  'dataset': 'train',
  'precision': 0.6965626234689846,
  'balanced_accuracy': np.float64(0.6628980415325005),
  'recall': 0.37312169312169313,
  'f1_score': 0.48594266813671444},
 {'type': 'metrics',
  'dataset': 'test',
  'precision': 0.6795937211449676,
  'balanced_accuracy': np.float64(0.6685445991280557),
  'recall': 0.3861490031479538,
  'f1_score': 0.4924723987955838}]

In [190]:
import json

# Step 6: write the metrics to a JSON-lines file
def save_metrics(metrics, filename='../files/output/metrics.json'):
    """Write each metric record as one JSON object per line.

    The file is truncated before writing, so re-running the notebook does not
    accumulate duplicate records. The parent directory is created when missing.

    Args:
        metrics: Iterable of JSON-serializable dicts.
        filename: Destination path of the output file.
    """
    import os  # local import keeps this cell self-contained

    directory = os.path.dirname(filename)
    if directory:  # a bare file name has no directory component to create
        os.makedirs(directory, exist_ok=True)

    # 'w' (not 'a'): this is the first writer of the file, so it starts it afresh.
    with open(filename, 'w') as f:
        for metric in metrics:
            json.dump(metric, f)
            f.write('\n')

# Write the computed metric records to the output file (one JSON object per line)
save_metrics(metrics)

# Paso 7: Matrices de confusión

In [191]:
from sklearn.metrics import confusion_matrix

# Step 7: compute the confusion matrices
def calculate_confusion_matrix(model, X, y, dataset_name):
    """Build a JSON-friendly confusion-matrix record for one dataset.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True binary labels (0/1) aligned with ``X``.
        dataset_name: Label stored under the ``'dataset'`` key (e.g. ``'train'``).

    Returns:
        dict: ``'type'`` (always ``'cm_matrix'``), ``'dataset'``, and the
        ``'true_0'`` / ``'true_1'`` rows, each holding ``'predicted_0'`` and
        ``'predicted_1'`` counts as plain ``int`` values.
    """
    y_pred = model.predict(X)
    # Pin the label set so the matrix is always 2x2. Without it the matrix
    # shrinks to 1x1 when only one class occurs in both y and y_pred, and the
    # cm[0, 1] / cm[1, *] lookups below raise IndexError.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    cm_dict = {
        'type': 'cm_matrix',
        'dataset': dataset_name,
        # int() converts the matrix cells to plain Python ints for json.dump.
        'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }
    return cm_dict

In [192]:
# Build one confusion-matrix record per split (train, then test) from the
# refitted best pipeline.
train_cm, test_cm = (
    calculate_confusion_matrix(gridSearchCV.best_estimator_, features, target, split)
    for features, target, split in (
        (x_train, y_train, 'train'),
        (x_test, y_test, 'test'),
    )
)

cm_matrices = [train_cm, test_cm]

In [193]:
import json

# Step 7: append the confusion matrices to the JSON-lines file
def save_confusion_matrices(cm_matrices, filename='../files/output/metrics.json'):
    """Append each confusion-matrix record as one JSON object per line.

    The file is opened in append mode, so existing content is preserved and
    the new records are added after it.

    Args:
        cm_matrices: Iterable of JSON-serializable dicts.
        filename: Path of the file to append to.
    """
    with open(filename, 'a') as out:
        out.writelines(json.dumps(entry) + '\n' for entry in cm_matrices)


# Append the computed confusion-matrix records to the output file
save_confusion_matrices(cm_matrices)
