In [None]:
# flake8: noqa: E501
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
import pickle
import json

# Step 1: Clean the datasets
def preprocess(df):
    """Return a cleaned copy of a raw credit-default DataFrame.

    The cleaning steps are:

    * rename ``'default payment next month'`` to ``'default'``;
    * drop the ``'ID'`` column;
    * drop every row that contains a missing value;
    * collapse every ``EDUCATION`` value above 4 into 4 and store the
      column as ``int64``.

    The input frame is left untouched, so the function can safely be called
    again on the same raw frame (e.g. when a notebook cell is re-run).

    Args:
        df: Raw DataFrame containing at least the ``ID`` and ``EDUCATION``
            columns.

    Returns:
        A new DataFrame with the cleaning steps applied. Row labels of the
        surviving rows are preserved.

    Raises:
        KeyError: If ``ID`` or ``EDUCATION`` is missing from ``df``.
    """
    return (
        df.rename(columns={'default payment next month': 'default'})
        .drop(columns=['ID'])
        .dropna()
        # Missing values are gone at this point, so the cast to int64 is safe.
        .assign(
            EDUCATION=lambda frame: frame['EDUCATION'].clip(upper=4).astype('int64')
        )
    )

# Read the zipped CSV files for both splits.
train_pre = pd.read_csv(
    '../files/input/train_data.csv.zip',
    index_col=None,
    compression='zip',
)
test_pre = pd.read_csv(
    '../files/input/test_data.csv.zip',
    index_col=None,
    compression='zip',
)

# Clean both splits with the same routine.
train, test = preprocess(train_pre), preprocess(test_pre)

# Step 2: Split each dataset into features (x) and target (y)
x_train, y_train = train.drop(columns=['default']), train['default']
x_test, y_test = test.drop(columns=['default']), test['default']

# Step 3: Build the pipeline
# Columns that are one-hot encoded. NOTE: a variant that also encodes
# PAY_0 and PAY_2..PAY_6 as categorical can be obtained by adding them here.
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']

# Every remaining column is treated as numerical. The list is derived in the
# DataFrame's own column order (instead of via a set difference) so that the
# transformed feature order -- and therefore the features picked by the
# selector and stored in the pickled model -- is identical on every run.
numerical_features = [
    column for column in x_train.columns if column not in categorical_features
]

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at predict time) and min-max scale the numerical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', MinMaxScaler(), numerical_features),
    ]
)

# Preprocess -> keep the k best features by ANOVA F-score -> logistic
# regression. The step names are referenced by the grid-search parameter keys.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("selector", SelectKBest(f_classif)),
        ("classifier", LogisticRegression()),
    ]
)

# Hyperparameter grid: keys are "<pipeline step>__<parameter>".
# NOTE: a wider C range, n_jobs=-1 and verbose=2 are optional variations of
# this search; they are not enabled here.
param_grid = {
    "selector__k": [5, 10, 15],  # number of features kept by the selector
    "classifier__C": [0.1, 1, 10],  # regularization parameter C
    "classifier__solver": ["liblinear", "saga"],  # optimization method
}

# 10-fold cross-validated search scored by balanced accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
)

grid_search.fit(x_train, y_train)

# Step 5: Save the model
# The whole fitted grid-search object is pickled, not just the best estimator.
model_path = "../files/models/model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as model_file:
    pickle.dump(grid_search, model_file)

# Steps 6 and 7: Compute metrics and save the results
def calculate_metrics(y_true, y_pred, dataset_name):
    """Build the metrics record for one dataset split.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        dataset_name: Label stored under the ``"dataset"`` key.

    Returns:
        A dict with ``"type": "metrics"``, the dataset name, and the
        precision, balanced accuracy, recall and F1 score as plain floats.
    """
    # Listed in the order the keys must appear in the resulting record.
    scorers = (
        ("precision", precision_score),
        ("balanced_accuracy", balanced_accuracy_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    record = {"type": "metrics", "dataset": dataset_name}
    for key, scorer in scorers:
        # float() turns the score into a plain Python float for json.dump.
        record[key] = float(scorer(y_true, y_pred))
    return record

def calculate_cm(y_true, y_pred, dataset_name):
    """Build the confusion-matrix record for one dataset split.

    The label set is pinned to ``[0, 1]`` so the matrix is always 2x2, even
    when only one class occurs in ``y_true`` and ``y_pred``. Without it the
    matrix would be 1x1 in that case and indexing it would fail.

    Args:
        y_true: Ground-truth binary labels (0 or 1).
        y_pred: Predicted binary labels (0 or 1).
        dataset_name: Label stored under the ``"dataset"`` key.

    Returns:
        A dict with ``"type": "cm_matrix"``, the dataset name, and the counts
        per true class (``"true_0"``, ``"true_1"``) broken down by predicted
        class (``"predicted_0"``, ``"predicted_1"``) as plain ints.
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Row-major order of a 2x2 matrix: (true 0, pred 0), (true 0, pred 1),
    # (true 1, pred 0), (true 1, pred 1). int() keeps the values JSON-friendly.
    t0_p0, t0_p1, t1_p0, t1_p1 = (int(count) for count in cm.ravel())
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {"predicted_0": t0_p0, "predicted_1": t0_p1},
        "true_1": {"predicted_0": t1_p0, "predicted_1": t1_p1},
    }

# Predictions from the best estimator found by the grid search
best_model = grid_search.best_estimator_
train_preds = best_model.predict(x_train)
test_preds = best_model.predict(x_test)

# Records are emitted as: metrics (train, test), then confusion matrices
# (train, test).
evaluated = (
    ("train", y_train, train_preds),
    ("test", y_test, test_preds),
)
metrics = [
    build_record(y_true, y_pred, split)
    for build_record in (calculate_metrics, calculate_cm)
    for split, y_true, y_pred in evaluated
]

# Save the metrics and confusion matrices as JSON
output_file = '../files/output/metrics.json'
os.makedirs('../files/output', exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
    f.write(json.dumps(metrics, indent=4))

print(f"Métricas y matrices de confusión guardadas exitosamente en {output_file}")




Métricas y matrices de confusión guardadas exitosamente en ../files/output/metrics.json
