# Paso 1: Limpieza de los datos

## Cargar dataset

In [1]:
import pandas as pd

train_data = pd.read_csv("../files/input/train_data.csv.zip")
test_data = pd.read_csv("../files/input/test_data.csv.zip")

## Procesamiento de la data

### Renombrar columnas

In [2]:
train_data.rename(columns={"default payment next month": "default"}, inplace=True)
test_data.rename(columns={"default payment next month": "default"}, inplace=True)

### Remover columna

In [3]:
train_data.drop(["ID"], axis=1, inplace=True)
test_data.drop(["ID"], axis=1, inplace=True)

### Eliminar datos faltantes

In [4]:
train_data.query('MARRIAGE != 0 and EDUCATION != 0', inplace= True)
test_data.query('MARRIAGE != 0 and EDUCATION != 0', inplace= True)

### Agrupar valores en una categoria

In [5]:
train_data['EDUCATION'] = train_data['EDUCATION'].apply(lambda x: 4 if x > 4 else x)
test_data['EDUCATION'] = test_data['EDUCATION'].apply(lambda x: 4 if x > 4 else x)

# Paso 2: División el dataset

In [6]:
x_train = train_data.drop("default", axis=1)
y_train = train_data["default"]

x_test = test_data.drop("default", axis=1)
y_test = test_data["default"]

# Paso 3: Creación del pipeline 

- Transforma las variables categoricas usando el método one-hot-encoding.
- Escala las demas variables al intervalo [0, 1].
- Selecciona las K mejores caracteristicas.
- Ajusta un modelo de regresion logistica.

### Transformadores

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

oneHotEncoder = OneHotEncoder()
scaler = MinMaxScaler()

#### ColumnTransformer

In [8]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector

column_trans = ColumnTransformer(
    transformers=[
        ("scaler", scaler, make_column_selector(dtype_include=np.number)),
        ("oneHotEncoder", oneHotEncoder, make_column_selector(dtype_include=object)),
    ],
    remainder="passthrough",
)

### Seleccionar K características

In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selectKBest = SelectKBest(
    score_func=f_classif,
    k=10,
)

### Modelo de Regresión logística

In [10]:
from sklearn.linear_model import LogisticRegression

logisticRegression = LogisticRegression()

## Pipeline

In [11]:
from sklearn.pipeline import Pipeline

estimators = [
    ("colTrans", column_trans),
    ("kSelect", selectKBest),
    ("lr", logisticRegression),
]

pipeline = Pipeline(
    steps=estimators,
    verbose=False,
)

# Paso 4: Optimizar hiperparámetros

- Optimice los hiperparametros del pipeline usando validación cruzada.
- Use 10 splits para la validación cruzada. Use la función de precision
- balanceada para medir la precisión del modelo.

### Iterador de VC

In [12]:
from sklearn.model_selection import StratifiedKFold

stratifiedKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

## GridSearchCV

In [13]:
# from sklearn.model_selection import ParameterGrid

# param_grid = {
#     # Ajustes para SelectKBest
#     "kSelect__k": range(1, len(x_train.columns) + 1),
    
#     # Parámetros de Regresión Logística
#     'lr__C': [0.1,1,10],
#     'lr__solver': ["lbfgs", "liblinear"],
#     'lr__max_iter': [200,1000],
#     'lr__fit_intercept': [ False, True],
#     'lr__tol' : [0.001, 0.01],
#     'lr__penalty': ['l1', 'l2' ], 
# }


# # list(ParameterGrid(param_grid))

In [14]:
from sklearn.model_selection import ParameterGrid

param_grid = {
    # Ajustes para SelectKBest
    "kSelect__k": range(1, len(x_train.columns) + 1),
    
    # Parámetros de Regresión Logística
    'lr__C': [0.1,1,10],
    'lr__solver': ["liblinear"],
    'lr__max_iter': [200],
    'lr__tol' : [0.01],
    'lr__penalty': ['l1'], 
}


# list(ParameterGrid(param_grid))

In [15]:
from sklearn.model_selection import GridSearchCV

gridSearchCV = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=stratifiedKFold,
    scoring="balanced_accuracy",
    # refit=True,
    n_jobs=-1,
)

In [16]:
gridSearchCV.fit(x_train, y_train)

In [17]:
gridSearchCV.best_estimator_.named_steps['lr']

In [18]:
gridSearchCV.best_score_

np.float64(0.6392679515991866)

In [19]:
gridSearchCV.best_params_

{'kSelect__k': 1,
 'lr__C': 1,
 'lr__max_iter': 200,
 'lr__penalty': 'l1',
 'lr__solver': 'liblinear',
 'lr__tol': 0.01}

In [20]:
train_score = gridSearchCV.score(x_train, y_train)
test_score = gridSearchCV.score(x_test, y_test)

print(f'Score en el conjunto de prueba: {train_score:.4f}')
print(f'Score en el conjunto de testing: {test_score:.4f}')

Score en el conjunto de prueba: 0.6393
Score en el conjunto de testing: 0.6547


In [21]:
gridSearchCV.cv_results_

{'mean_fit_time': array([0.11348515, 0.12077227, 0.13792863, 0.15540287, 0.16485982,
        0.14295638, 0.16005223, 0.16203442, 0.14894445, 0.14800487,
        0.17232246, 0.15779474, 0.18031878, 0.16585367, 0.18184061,
        0.18726187, 0.18268464, 0.19294336, 0.20499966, 0.21501987,
        0.21365726, 0.19752457, 0.22712882, 0.2186419 , 0.20698318,
        0.24689615, 0.23523681, 0.210306  , 0.25499794, 0.24270632,
        0.21153698, 0.26321435, 0.27632191, 0.22134323, 0.27048194,
        0.26577399, 0.2106276 , 0.27786827, 0.27874048, 0.22123952,
        0.30089755, 0.29114704, 0.24766102, 0.31572692, 0.30644226,
        0.26064565, 0.32082314, 0.31723843, 0.28840408, 0.35046232,
        0.37583516, 0.3049623 , 0.39075735, 0.37273593, 0.30255249,
        0.37473388, 0.39467947, 0.32537887, 0.41690555, 0.397895  ,
        0.35114601, 0.41551561, 0.4169488 , 0.36656628, 0.43412538,
        0.47106254, 0.39850769, 0.43139381, 0.3528429 ]),
 'std_fit_time': array([0.02339776, 0.013

# Paso 5: Salvar el modelo

Salve el modelo como "files/models/model.pkl.gz".

In [22]:
model = gridSearchCV

In [23]:
# import pickle

# with open("../files/models/model.pkl", "wb") as file:
#         pickle.dump(model, file)

In [25]:
import gzip
import pickle

# Guardar el modelo comprimido con gzip
model_filename = '../files/models/model.pkl.gz'

with gzip.open(model_filename, 'wb') as f:
    pickle.dump(model, f)

# Paso 6: Cálculo de métricas

- Calcule las metricas de precision, precision balanceada, recall, y f1-score para los conjuntos de entrenamiento y prueba.
- Guardelas en el archivo files/output/metrics.json. 

Ejemplo
```json
{'type': 'metrics', 'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
{'type': 'metrics', 'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}
```

In [26]:
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
import json

def calculate_metrics(model, X, y, dataset_name):
    y_pred = model.predict(X)
    
    precision = precision_score(y, y_pred)
    balanced_accuracy = balanced_accuracy_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    
    metrics = {
        'type': 'metrics',
        'dataset': dataset_name,
        'precision': precision,
        'balanced_accuracy': balanced_accuracy,
        'recall': recall,
        'f1_score': f1
    }
    return metrics


In [27]:
# Calcular métricas para el conjunto de entrenamiento y prueba
train_metrics = calculate_metrics(model, x_train, y_train, 'train')
test_metrics = calculate_metrics(model, x_test, y_test, 'test')

metrics = [train_metrics, test_metrics]

In [28]:
metrics

[{'type': 'metrics',
  'dataset': 'train',
  'precision': 0.6939338235294118,
  'balanced_accuracy': np.float64(0.6392682710528409),
  'recall': 0.31957671957671957,
  'f1_score': 0.43761773655991887},
 {'type': 'metrics',
  'dataset': 'test',
  'precision': 0.7017913593256059,
  'balanced_accuracy': np.float64(0.6547057822566611),
  'recall': 0.34942287513116477,
  'f1_score': 0.46654991243432575}]

In [29]:
import json

# Paso 6: Guardar las métricas en un archivo JSON
def save_metrics(metrics, filename='../files/output/metrics.json'):
    with open(filename, 'a') as f:
        for metric in metrics:
            json.dump(metric, f)
            f.write('\n')

# Guardar las métricas calculadas
save_metrics(metrics)

# Paso 7: Matrices de confusión

Calcule las matrices de confusion para los conjuntos de entrenamiento y prueba. Guardelas en el archivo files/output/metrics.json.

```json
{'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicte_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
{'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicte_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}
```

In [30]:
from sklearn.metrics import confusion_matrix

# Paso 7: Calcular y guardar las matrices de confusión
def calculate_confusion_matrix(model, X, y, dataset_name):
    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)
    cm_dict = {
        'type': 'cm_matrix',
        'dataset': dataset_name,
        'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }
    return cm_dict

In [31]:
# Calcular matrices de confusión para el conjunto de entrenamiento y prueba
train_cm = calculate_confusion_matrix(gridSearchCV.best_estimator_, x_train, y_train, 'train')
test_cm = calculate_confusion_matrix(gridSearchCV.best_estimator_, x_test, y_test, 'test')

cm_matrices = [train_cm, test_cm]

In [32]:
import json

# Paso 7: Guardar las matrices de confusión en el archivo JSON
def save_confusion_matrices(cm_matrices, filename='../files/output/metrics.json'):
    with open(filename, 'a') as f:
        for cm in cm_matrices:
            json.dump(cm, f)
            f.write('\n')


# Guardar las matrices de confusión calculadas
save_confusion_matrices(cm_matrices)
