#### Paso 1

In [1]:
import pandas as pd

# Load the train and test splits from their zip-compressed CSV files.
# Both reads share the same options, so they are declared once.
read_options = {"index_col": False, "compression": "zip"}

data_train = pd.read_csv("../files/input/train_data.csv.zip", **read_options)
data_test = pd.read_csv("../files/input/test_data.csv.zip", **read_options)

In [2]:
import numpy as np

def _clean_credit_data(raw):
    """Return a cleaned copy of a raw credit-default DataFrame.

    The same steps are applied to the train and the test split:

    * rename the ``"default payment next month"`` column to ``"default"``;
    * drop the ``"ID"`` column;
    * fold every ``EDUCATION`` value above 4 into 4;
    * replace non-positive ``EDUCATION`` and ``MARRIAGE`` values with NaN;
    * drop every row that contains a missing value.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame holding at least the ``EDUCATION`` and ``MARRIAGE`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw`` itself is left unmodified.
    """
    cleaned = (
        raw
        .rename(columns={"default payment next month": "default"})
        # errors="ignore" keeps the cell re-runnable: on a second execution
        # the "ID" column is already gone and must not raise a KeyError.
        .drop(columns=["ID"], errors="ignore")
    )

    # Vectorised equivalent of `4 if x > 4 else x`.
    education = cleaned["EDUCATION"].clip(upper=4)
    # `where` keeps values that satisfy the condition and puts NaN elsewhere,
    # i.e. the vectorised form of `x if x > 0 else np.nan`.
    cleaned["EDUCATION"] = education.where(education > 0)
    cleaned["MARRIAGE"] = cleaned["MARRIAGE"].where(cleaned["MARRIAGE"] > 0)

    return cleaned.dropna()


data_train = _clean_credit_data(data_train)
data_test = _clean_credit_data(data_test)

#### Paso 2

In [4]:
# Separate the predictors from the "default" target for each split.
x_train, y_train = data_train.drop('default', axis=1), data_train['default']
x_test, y_test = data_test.drop('default', axis=1), data_test['default']

#### Paso 3

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Columns that are one-hot encoded before reaching the classifier.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

# Fixed random_state so repeated runs build the same forest.
clf = RandomForestClassifier(random_state=42)

# One-hot encode the categorical columns; every other column is passed
# through unchanged.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(), categorical_features)],
    remainder='passthrough',
)

# Full model: preprocessing followed by the random forest.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', clf),
])

#### Paso 4

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Hyperparameter grid for GridSearchCV

# The `clf__` prefix routes each entry to the 'clf' step of the pipeline.
# Every list holds a single value, so one configuration is evaluated.
param_grid = dict(
    clf__n_estimators=[200],       # number of trees
    clf__max_depth=[45],           # maximum depth of each tree
    clf__min_samples_split=[8],    # minimum samples a node needs in order to be split
    # class_weight is currently not searched:
    # clf__class_weight=['balanced'],
    clf__max_features=['sqrt'],    # features considered when looking for the best split
)

# 10-fold cross-validated search scored by balanced accuracy, using all cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=10,
    n_jobs=-1,
)

model.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



#### Paso 5

In [None]:
import os
import pickle
import gzip

# Persist the fitted search object as a gzip-compressed pickle.
dir_path = '../files/models'
model_path = f'{dir_path}/model.pkl.gz'

# exist_ok=True creates the directory when it is missing and is a no-op
# otherwise, so no separate existence check (or duplicated dump branch)
# is needed.
os.makedirs(dir_path, exist_ok=True)

with gzip.open(model_path, 'wb') as f:
    pickle.dump(model, f)

#### Paso 6

In [None]:
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
import json

def _metrics_record(dataset, y_true, y_pred):
    """Build the metrics record for one split.

    Parameters
    ----------
    dataset : str
        Label stored under the 'dataset' key ('train' or 'test').
    y_true, y_pred :
        True and predicted labels handed to the sklearn scorers.

    Returns
    -------
    dict
        Record with the keys type, dataset, precision, balanced_accuracy,
        recall and f1_score, in that order.
    """
    return {
        "type": "metrics",
        'dataset': dataset,
        'precision': precision_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred),
    }


# Predictions are kept in their own variables because later cells reuse them.
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

train_metrics = _metrics_record('train', y_train, y_train_pred)
test_metrics = _metrics_record('test', y_test, y_test_pred)

output_path = '../files/output/metrics.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Mode 'w' truncates the file, then one JSON object is written per line.
with open(output_path, 'w', encoding='utf-8') as f:
    for record in (train_metrics, test_metrics):
        f.write(json.dumps(record, ensure_ascii=False))
        f.write('\n')

#### Paso 7

In [9]:
from sklearn.metrics import confusion_matrix

def _cm_record(dataset, cm):
    """Turn a confusion matrix into a JSON-serialisable record.

    Parameters
    ----------
    dataset : str
        Label stored under the 'dataset' key ('train' or 'test').
    cm :
        Confusion matrix indexed as cm[true, predicted]; only the cells
        [0, 0], [0, 1], [1, 0] and [1, 1] are read.
        NOTE(review): assumes a binary target — confirm.

    Returns
    -------
    dict
        Record with the keys type, dataset, true_0 and true_1; counts are
        converted to plain int so json can serialise them.
    """
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {'predicted_0': int(cm[0, 0]), 'predicted_1': int(cm[0, 1])},
        'true_1': {'predicted_0': int(cm[1, 0]), 'predicted_1': int(cm[1, 1])},
    }


train_cm = confusion_matrix(y_train, y_train_pred)
test_cm = confusion_matrix(y_test, y_test_pred)

train_cm_dict = _cm_record('train', train_cm)
test_cm_dict = _cm_record('test', test_cm)

output_path = '../files/output/metrics.json'

# Plain append would add a duplicate pair of cm_matrix lines every time this
# cell is re-run. Instead, keep whatever is already in the file except earlier
# cm_matrix lines, then write the fresh ones after it.
cm_marker = '"type": "cm_matrix"'  # exact text json.dump emits for these records
try:
    with open(output_path, 'r', encoding='utf-8') as f:
        kept_lines = [
            line if line.endswith('\n') else line + '\n'
            for line in f
            if cm_marker not in line
        ]
except FileNotFoundError:
    # No metrics file yet: behave like the first append and start from empty.
    kept_lines = []

with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(kept_lines)
    for record in (train_cm_dict, test_cm_dict):
        json.dump(record, f, ensure_ascii=False)
        f.write('\n')