In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import json
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score
import gzip
import joblib

In [2]:
ruta_train = '../files/input/train_data.csv.zip'
ruta_test = "../files/input/test_data.csv.zip"

ruta_guardado = "files/models/model.pkl"
ruta_output = "files/output/metrics.json"

#ruta_datos_x_test = "files/grading/x_test.pkl"
#ruta_datos_y_test="files/grading/y_test.pkl"
#ruta_datos_x_train="files/grading/x_train.pkl"
#ruta_datos_y_train="files/grading/y_train.pkl"

In [3]:
def read(ruta): 
  
    df = pd.read_csv(ruta, compression='zip')
 
    return df


In [4]:
df_train = read(ruta_train)
df_test = read(ruta_test)

In [5]:
def clear(df):
    
    df = df.copy()
# - Renombre la columna "default payment next month" a "default".
    df = df.rename(columns={"default payment next month":"default"})
# - Remueva la columna "ID".
    df = df.drop(columns=["ID"])
# - Elimine los registros con informacion no disponible.
    df = df.dropna()
# - Para la columna EDUCATION, valores > 4 indican niveles superiores
#   de educación, agrupe estos valores en la categoría "others".  
    df['EDUCATION'] = np.where(df['EDUCATION'] > 4, 4, df['EDUCATION'])
    df = df.loc[df['EDUCATION'] != 0]
    df = df.loc[df['MARRIAGE'] != 0]
## redefinir las columnas categoricas 
    #df["SEX"] = df["SEX"].map({1: "male", 2: "female"},)
    #df["EDUCATION"] = df["EDUCATION"].map({ 0:"N/A", 1:"graduate school", 2:"university", 3:"high school", 4:"others"},)
    #df["MARRIAGE"] = df["MARRIAGE"].map({ 0:"N/A", 1:"married", 2:"single", 3:"others"},)

    return df

In [6]:
df_train = clear(df_train)
df_test=clear(df_test)
print(df_train["EDUCATION"].value_counts())
print(df_train["MARRIAGE"].value_counts())
print(df_test["EDUCATION"].value_counts())
print(df_test["MARRIAGE"].value_counts())

EDUCATION
2    9756
1    7476
3    3396
4     325
Name: count, dtype: int64
MARRIAGE
2    11226
1     9502
3      225
Name: count, dtype: int64
EDUCATION
2    4268
1    3105
3    1477
4     129
Name: count, dtype: int64
MARRIAGE
2    4728
1    4153
3      98
Name: count, dtype: int64


In [7]:
def dividr_datos(df):
    df = df.copy()
    X=df.drop(columns=["default"])
    y=df["default"]
    return  X,y

In [8]:
x_train, y_train=dividr_datos(df_train)
x_test, y_test= dividr_datos(df_test)

In [53]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, balanced_accuracy_score

# Columnas categóricas y numéricas
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE']
numerical_features = [col for col in x_train if col not in categorical_columns]

# Preprocesamiento
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns),  # One-Hot Encoding
        ('scaler', StandardScaler(), numerical_features),  # Escalar al rango [0, 1]
    ]
)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=None)),  # PCA con todas las componentes
    ('feature_selection', SelectKBest(score_func=f_classif)),  # Selección de características
    ('classifier', MLPClassifier(random_state=42, max_iter=1000))  # Red neuronal MLP
])

# Definir la búsqueda de hiperparámetros
param_grid = {
    # Parámetros para SelectKBest
    'feature_selection__k': [20],  # Número de características a seleccionar
    
    # Parámetros para MLPClassifier
    'classifier__hidden_layer_sizes': [(100,50,100,50,100,100), ],  # Tamaño de capas ocultas
    'classifier__activation': ['relu'],  # Función de activación
    #'classifier__solver': ['adam', 'sgd'],  # Optimizadores
    'classifier__alpha': [0.5],  # Regularización L2
    #'classifier__early_stopping':[True],
    'classifier__learning_rate_init': [ 0.0003]

    
}

# Configurar la métrica de precisión balanceada
scorer = make_scorer(balanced_accuracy_score)

# Crear el objeto de búsqueda
grid_search = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=10, 
    scoring=scorer, 
    n_jobs=-1,
    refit=True,
)

# Ajustar el modelo
modelo = grid_search.fit(x_train, y_train)

# Resultados
print("Mejores parámetros encontrados:", grid_search.best_params_)
print("Mejor puntuación de validación cruzada:", grid_search.best_score_)

# Evaluar el modelo en el conjunto de prueba
y_pred = grid_search.best_estimator_.predict(x_train)
test_score = balanced_accuracy_score(y_train, y_pred)
print("Precisión balanceada en el conjunto de prueba:", test_score)

print(precision_score(y_train, y_pred, zero_division=0))


Mejores parámetros encontrados: {'classifier__activation': 'relu', 'classifier__alpha': 0.5, 'classifier__hidden_layer_sizes': (100, 50, 100, 50, 100, 100), 'classifier__learning_rate_init': 0.0003, 'feature_selection__k': 20}
Mejor puntuación de validación cruzada: 0.6503162685675925
Precisión balanceada en el conjunto de prueba: 0.6640888372438779
0.6964705882352941


In [54]:
ruta_guardado = "../files/models/model.pkl.gz"

# Asegúrate de que el directorio existe
ruta_guardado_carpeta = os.path.dirname(ruta_guardado)
os.makedirs(ruta_guardado_carpeta, exist_ok=True)

# Guarda el modelo
with gzip.open(ruta_guardado, 'wb') as f:
    pickle.dump(grid_search, f)

In [55]:
def calculate_and_save_metrics(model, X_train, X_test, y_train, y_test):
    # Hacer predicciones
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calcular métricas para el conjunto de entrenamiento
    metrics_train = {
        'type': 'metrics',
        'dataset': 'train',
        'precision': precision_score(y_train, y_train_pred, zero_division=0),
        'balanced_accuracy': balanced_accuracy_score(y_train, y_train_pred),
        'recall': recall_score(y_train, y_train_pred, zero_division=0),
        'f1_score': f1_score(y_train, y_train_pred, zero_division=0)
    }

    # Calcular métricas para el conjunto de prueba
    metrics_test = {
        'type': 'metrics',
        'dataset': 'test',
        'precision': precision_score(y_test, y_test_pred, zero_division=0),
        'balanced_accuracy': balanced_accuracy_score(y_test, y_test_pred),
        'recall': recall_score(y_test, y_test_pred, zero_division=0),
        'f1_score': f1_score(y_test, y_test_pred, zero_division=0)
    }

    # Crear carpeta si no existe
    output_dir = '../files/output'
    os.makedirs(output_dir, exist_ok=True)

    # Guardar las métricas en un archivo JSON
    output_path = os.path.join(output_dir, 'metrics.json')
    with open(output_path, 'w') as f:  # Usar 'w' para comenzar con un archivo limpio
        f.write(json.dumps(metrics_train) + '\n')
        f.write(json.dumps(metrics_test) + '\n')


In [56]:
def calculate_and_save_confusion_matrices(model, X_train, X_test, y_train, y_test):
    # Hacer predicciones
    y_train_pred = modelo.predict(x_train)
    y_test_pred = modelo.predict(x_test)

    # Calcular matrices de confusión
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    # Convertir las matrices de confusión en formato JSON
    def format_confusion_matrix(cm, dataset_type):
        return {
            'type': 'cm_matrix',
            'dataset': dataset_type,
            'true_0': {
                'predicted_0': int(cm[0, 0]),
                'predicted_1': int(cm[0, 1])
            },
            'true_1': {
                'predicted_0': int(cm[1, 0]),
                'predicted_1': int(cm[1, 1])
            }
        }

    metrics = [
        format_confusion_matrix(cm_train, 'train'),
        format_confusion_matrix(cm_test, 'test')
    ]

    # Guardar las matrices de confusión en el mismo archivo JSON
    output_path = '../files/output/metrics.json'
    with open(output_path, 'a') as f:  # Usar 'a' para agregar después de las métricas
        for metric in metrics:
            f.write(json.dumps(metric) + '\n')

# Función principal para ejecutar todo
def main(modelo, X_train, X_test, y_train, y_test):
    # Crear el directorio de salida si no existe
    import os
    os.makedirs('../files/output', exist_ok=True)

    # Calcular y guardar las métricas
    calculate_and_save_metrics(modelo, X_train, X_test, y_train, y_test)

    # Calcular y guardar las matrices de confusión
    calculate_and_save_confusion_matrices(modelo, X_train, X_test, y_train, y_test)

# Ejemplo de uso:
main(modelo, x_train, x_test, y_train, y_test)


In [21]:
def _load_model():
    ruta_guardado = "../files/models/model.pkl.gz"
    with gzip.open(ruta_guardado, 'rb') as f:
        model=joblib.load(f)
        return model


In [None]:
model = _load_model()
print(type(model))  # Asegúrate de que sea <class 'GridSearchCV'>


In [23]:
def _load_model():
    ruta_guardado = "../files/models/model.pkl.gz"
    """Generic test to load a model"""
    with gzip.open(ruta_guardado, "rb") as file:
        model = pickle.load(file)
    return model


In [None]:
modelo=_load_model()
print(type(modelo))
modelo