# Paso 1: Limpieza de los datos

## Cargar dataset

In [1]:
import pandas as pd

train_data = pd.read_csv("../files/input/train_data.csv.zip")
test_data = pd.read_csv("../files/input/test_data.csv.zip")

## Procesamiento de la data

### Renombrar columnas

In [2]:
train_data.rename(columns={"default payment next month": "default"}, inplace=True)
test_data.rename(columns={"default payment next month": "default"}, inplace=True)

### Remover columna

In [3]:
train_data.drop(["ID"], axis=1, inplace=True)
test_data.drop(["ID"], axis=1, inplace=True)

### Eliminar datos faltantes

In [4]:
train_data.query('MARRIAGE != 0 and EDUCATION != 0', inplace= True)
test_data.query('MARRIAGE != 0 and EDUCATION != 0', inplace= True)

### Agrupar valores en una categoria

In [5]:
train_data['EDUCATION'] = train_data['EDUCATION'].apply(lambda x: 4 if x > 4 else x)
test_data['EDUCATION'] = test_data['EDUCATION'].apply(lambda x: 4 if x > 4 else x)

# Paso 2: División el dataset

In [6]:
x_train = train_data.drop("default", axis=1)
y_train = train_data["default"]

x_test = test_data.drop("default", axis=1)
y_test = test_data["default"]

# Paso 3: Creación del pipeline 

- Transforma las variables categoricas usando el método one-hot-encoding.
- Escala las demas variables al intervalo [0, 1].
- Selecciona las K mejores caracteristicas.
- Ajusta un modelo de regresion logistica.

### Transformadores

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

oneHotEncoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

#### ColumnTransformer

In [8]:
import numpy as np
from sklearn.compose import ColumnTransformer

categorical_feature=['EDUCATION','SEX','MARRIAGE']


column_trans = ColumnTransformer(
    transformers=[
        ("oneHotEncoder", oneHotEncoder, categorical_feature),
    ],
    remainder="passthrough",
)

### Descomposición con PCA

In [9]:
from sklearn.decomposition import PCA

pca = PCA(random_state=42)

### Seleccionar K características

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selectKBest = SelectKBest(
    score_func=f_classif,
    k="all",
)

### Modelo de Regresión logística

In [11]:
from sklearn.svm import SVC

svm = SVC(random_state=42)

## Pipeline

In [12]:
from sklearn.pipeline import Pipeline

estimators = [
    ("colTrans", column_trans),
    ("pca", pca), 
    ("scaler", scaler),
    ("kSelect", selectKBest),
    ("svm", svm),  
]

pipeline = Pipeline(
    steps=estimators,
    verbose=False,
)

# Paso 4: Optimizar hiperparámetros

- Optimice los hiperparametros del pipeline usando validación cruzada.
- Use 10 splits para la validación cruzada. Use la función de precision
- balanceada para medir la precisión del modelo.

### Iterador de VC

In [13]:
from sklearn.model_selection import StratifiedKFold

stratifiedKFold = StratifiedKFold(
    n_splits=10,
    shuffle=False,
    # random_state=42,
)

## GridSearchCV

In [14]:
from sklearn.model_selection import ParameterGrid

param_grid = {
    # Ajustes para SelectKBest
    "kSelect__k": [4], 
    
    # Hiperparámetros para el modelo SVM
    "svm__C": [10, 13.5],           # Regularización
    "svm__kernel": ["linear", "rbf"],  # Tipos de kernel
    'svm__degree': [2,3],
    "svm__gamma": ["auto"],   # Parámetro gamma para el kernel RBF
}


# list(ParameterGrid(param_grid))

In [15]:
from sklearn.model_selection import GridSearchCV

gridSearchCV = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring="balanced_accuracy",
    # refit=True,
    n_jobs=-1,
)

In [16]:
gridSearchCV.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [17]:
gridSearchCV.best_estimator_.named_steps['svm']

In [18]:
gridSearchCV.best_score_

np.float64(0.65306032613731)

In [19]:
gridSearchCV.best_params_

{'kSelect__k': 4,
 'svm__C': 13.5,
 'svm__degree': 2,
 'svm__gamma': 'auto',
 'svm__kernel': 'rbf'}

In [20]:
train_score = gridSearchCV.score(x_train, y_train)
test_score = gridSearchCV.score(x_test, y_test)

print(f'Score en el conjunto de prueba: {train_score:.4f}')
print(f'Score en el conjunto de testing: {test_score:.4f}')

Score en el conjunto de prueba: 0.6613
Score en el conjunto de testing: 0.6667


# Paso 5: Salvar el modelo

Salve el modelo como "files/models/model.pkl.gz".

In [21]:
model = gridSearchCV

In [22]:
# import pickle

# with open("../files/models/model.pkl", "wb") as file:
#         pickle.dump(model, file)

In [23]:
import gzip
import pickle

# Guardar el modelo comprimido con gzip
model_filename = '../files/models/model.pkl.gz'

with gzip.open(model_filename, 'wb') as f:
    pickle.dump(model, f)

# Paso 6: Cálculo de métricas

- Calcule las metricas de precision, precision balanceada, recall, y f1-score para los conjuntos de entrenamiento y prueba.
- Guardelas en el archivo files/output/metrics.json. 

Ejemplo
```json
{'type': 'metrics', 'dataset': 'train', 'precision': 0.8, 'balanced_accuracy': 0.7, 'recall': 0.9, 'f1_score': 0.85}
{'type': 'metrics', 'dataset': 'test', 'precision': 0.7, 'balanced_accuracy': 0.6, 'recall': 0.8, 'f1_score': 0.75}
```

In [24]:
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
import json

def calculate_metrics(model, X, y, dataset_name):
    y_pred = model.predict(X)
    
    precision = precision_score(y, y_pred)
    balanced_accuracy = balanced_accuracy_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    
    metrics = {
        'type': 'metrics',
        'dataset': dataset_name,
        'precision': precision,
        'balanced_accuracy': balanced_accuracy,
        'recall': recall,
        'f1_score': f1
    }
    return metrics


In [25]:
# Calcular métricas para el conjunto de entrenamiento y prueba
train_metrics = calculate_metrics(model, x_train, y_train, 'train')
test_metrics = calculate_metrics(model, x_test, y_test, 'test')

metrics = [train_metrics, test_metrics]

In [26]:
metrics

[{'type': 'metrics',
  'dataset': 'train',
  'precision': 0.692094861660079,
  'balanced_accuracy': np.float64(0.6612892798781387),
  'recall': 0.37058201058201057,
  'f1_score': 0.4827015851137147},
 {'type': 'metrics',
  'dataset': 'test',
  'precision': 0.6737327188940092,
  'balanced_accuracy': np.float64(0.6667381121682754),
  'recall': 0.38352570828961174,
  'f1_score': 0.4887997325309261}]

In [27]:
import json

# Paso 6: Guardar las métricas en un archivo JSON
def save_metrics(metrics, filename='../files/output/metrics.json'):
    with open(filename, 'a') as f:
        for metric in metrics:
            json.dump(metric, f)
            f.write('\n')

# Guardar las métricas calculadas
save_metrics(metrics)

# Paso 7: Matrices de confusión

Calcule las matrices de confusion para los conjuntos de entrenamiento y prueba. Guardelas en el archivo files/output/metrics.json.

```json
{'type': 'cm_matrix', 'dataset': 'train', 'true_0': {"predicted_0": 15562, "predicte_1": 666}, 'true_1': {"predicted_0": 3333, "predicted_1": 1444}}
{'type': 'cm_matrix', 'dataset': 'test', 'true_0': {"predicted_0": 15562, "predicte_1": 650}, 'true_1': {"predicted_0": 2490, "predicted_1": 1420}}
```

In [28]:
from sklearn.metrics import confusion_matrix

# Paso 7: Calcular y guardar las matrices de confusión
def calculate_confusion_matrix(model, X, y, dataset_name):
    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)
    cm_dict = {
        'type': 'cm_matrix',
        'dataset': dataset_name,
        'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }
    return cm_dict

In [29]:
# Calcular matrices de confusión para el conjunto de entrenamiento y prueba
train_cm = calculate_confusion_matrix(gridSearchCV.best_estimator_, x_train, y_train, 'train')
test_cm = calculate_confusion_matrix(gridSearchCV.best_estimator_, x_test, y_test, 'test')

cm_matrices = [train_cm, test_cm]

In [30]:
import json

# Paso 7: Guardar las matrices de confusión en el archivo JSON
def save_confusion_matrices(cm_matrices, filename='../files/output/metrics.json'):
    with open(filename, 'a') as f:
        for cm in cm_matrices:
            json.dump(cm, f)
            f.write('\n')


# Guardar las matrices de confusión calculadas
save_confusion_matrices(cm_matrices)
