In [1]:
import os

import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from scipy.sparse import csr_matrix
from scipy.sparse import load_npz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
# Directory the data-loading cell reads the feature matrices (.npz) and label
# pickles (.pkl) from. The path is relative, so it resolves against the
# kernel's current working directory.
load_dir = '../../data/gold/'

In [2]:
import mlflow


# Point MLflow at the SQLite tracking store `mlflow.db` and select the
# experiment that the runs in this notebook are recorded under.
experiment_name = 'fraud_detection_experiment_v1'
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name)

# Look the experiment up by name and report its ID as a sanity check.
experiment = mlflow.get_experiment_by_name(experiment_name)
print(
    f"ID: {experiment.experiment_id}, Name: {experiment.name}"
    if experiment
    else "The experiment doesn't exist"
)

ID: 2, Name: fraud_detection_experiment_v1


In [3]:
# Print the ID and name of every experiment known to the tracking store.
# NOTE(review): the AttributeError handler presumably guards against MLflow
# versions that do not provide `search_experiments` - confirm. It also catches
# an AttributeError raised while printing, because the loop sits inside `try`.
try:
    experiments = mlflow.search_experiments()
    for exp in experiments:
        print(f"ID: {exp.experiment_id}, Name: {exp.name}")
except AttributeError as e:
    # Report the problem instead of aborting the notebook.
    print(f"Error: {e}")

ID: 2, Name: fraud_detection_experiment_v1
ID: 1, Name: model-experiment-v1
ID: 0, Name: Default


In [4]:
# Load the train/test feature matrices; `load_npz` returns scipy sparse matrices.
X_train_scaled, X_test_scaled = (
    load_npz(os.path.join(load_dir, matrix_file))
    for matrix_file in ('X_train_scaled.npz', 'X_test_scaled.npz')
)

# Load the matching labels.
# NOTE(review): joblib.load unpickles the file, so only use it on trusted data.
y_train, y_test = (
    joblib.load(os.path.join(load_dir, labels_file))
    for labels_file in ('y_train.pkl', 'y_test.pkl')
)

In [5]:
# Untuned baseline estimators, keyed by the same model names that
# `search_spaces` and `objective` use. The two tree-based models use a fixed
# random_state of 42.
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB()),
])

In [6]:
def _tree_space():
    """Return a fresh sub-space with the settings shared by both tree models."""
    return dict(
        max_depth=hp.quniform('max_depth', 1, 50, 1),
        # A float here is interpreted by scikit-learn as a fraction of the samples.
        min_samples_split=hp.uniform('min_samples_split', 0.01, 0.5),
    )


# Hyperopt search space per model name. `hp.quniform` yields floats, so
# `objective` casts those values to int before building the estimator.
search_spaces = {
    'Logistic Regression': dict(
        C=hp.loguniform('C', -4, 4),
        solver=hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
        max_iter=hp.quniform('max_iter', 100, 1000, 50),
    ),
    'Random Forest': dict(
        n_estimators=hp.quniform('n_estimators', 10, 200, 10),
        **_tree_space(),
    ),
    'Decision Tree': _tree_space(),
    # Naive Bayes has no tunable hyperparameters in this setup.
    'Naive Bayes': dict(),
}

In [7]:
def _build_model(params, model_name):
    """Build the estimator for ``model_name`` from one Hyperopt sample.

    Returns:
        tuple: ``(model, logged_params)`` where ``logged_params`` holds the
        hyperparameters to record in MLflow (empty for Naive Bayes).

    Raises:
        ValueError: if ``model_name`` is not one of the supported models.
    """
    if model_name == 'Logistic Regression':
        logged_params = {
            'C': params['C'],
            'solver': params['solver'],
            # hp.quniform yields floats; the estimator needs an integer.
            'max_iter': int(params['max_iter']),
        }
        return LogisticRegression(**logged_params), logged_params
    if model_name == 'Random Forest':
        logged_params = {
            'n_estimators': int(params['n_estimators']),
            'max_depth': int(params['max_depth']),
            'min_samples_split': params['min_samples_split'],
        }
        return RandomForestClassifier(random_state=42, **logged_params), logged_params
    if model_name == 'Decision Tree':
        logged_params = {
            'max_depth': int(params['max_depth']),
            'min_samples_split': params['min_samples_split'],
        }
        return DecisionTreeClassifier(random_state=42, **logged_params), logged_params
    if model_name == 'Naive Bayes':
        # No hyperparameters are tuned for Naive Bayes.
        return GaussianNB(), {}
    raise ValueError(f"Modelo no soportado: {model_name}")


def _as_dense(matrix):
    """Return ``matrix`` as a dense array when it exposes ``toarray``; otherwise unchanged."""
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


def objective(params, model_name):
    """Train and score one candidate model, recording the trial as an MLflow run.

    Args:
        params: Hyperparameter sample drawn by Hyperopt from
            ``search_spaces[model_name]``.
        model_name: One of 'Logistic Regression', 'Random Forest',
            'Decision Tree' or 'Naive Bayes'.

    Returns:
        dict: ``{'loss': -accuracy, 'status': STATUS_OK}``. The accuracy is
        negated because Hyperopt minimises the objective.

    Raises:
        ValueError: if ``model_name`` is not supported. This is raised before
            an MLflow run is opened, so no empty failed run is left behind.

    NOTE(review): the score is computed on X_test_scaled / y_test, so the
    hyperparameters are tuned against the test split and the reported accuracy
    is optimistic. Consider a validation split or cross-validation. Accuracy
    may also be a weak objective if the classes are imbalanced - confirm.
    """
    # Build (and validate) the estimator before a run is started.
    model, logged_params = _build_model(params, model_name)

    X_fit, X_eval = X_train_scaled, X_test_scaled
    if isinstance(model, GaussianNB):
        # GaussianNB rejects scipy sparse input, and the features are loaded
        # with load_npz, so densify them for this model only. This can use a
        # lot of memory on wide matrices.
        X_fit, X_eval = _as_dense(X_fit), _as_dense(X_eval)

    with mlflow.start_run(run_name=model_name):
        # Identify the model so runs can be told apart in the experiment.
        mlflow.set_tag('model_name', model_name)
        if logged_params:
            mlflow.log_params(logged_params)

        # Train the model.
        model.fit(X_fit, y_train)

        # Evaluate the model and record the metric.
        accuracy = model.score(X_eval, y_test)
        mlflow.log_metric('accuracy', accuracy)

        # Store the fitted model as a run artifact.
        mlflow.sklearn.log_model(model, "model")

    return {'loss': -accuracy, 'status': STATUS_OK}

In [8]:
# Number of Hyperopt evaluations per model. Each evaluation fits a model and
# logs it to MLflow, so lower this for a quick run.
MAX_EVALS = 50

# Best hyperparameters found for each model, keyed by model name.
best_params = {}

for model_name in search_spaces.keys():
    print(f"Optimización de hiperparámetros para {model_name}...")

    space = search_spaces[model_name]

    # Use a fresh Trials instance for each model.
    trials = Trials()

    if not space:
        # There is nothing to search, so every trial would fit and log the
        # same model. Evaluate it once instead of MAX_EVALS times.
        objective({}, model_name)
        best = {}
    else:
        raw_best = fmin(
            # Bind the current name as a default so the callback does not
            # depend on the loop variable's later value.
            fn=lambda params, name=model_name: objective(params, name),
            space=space,
            algo=tpe.suggest,
            max_evals=MAX_EVALS,
            trials=trials,
        )
        # fmin reports hp.choice parameters as list indices; space_eval maps
        # them back to the actual values (e.g. the solver name).
        best = space_eval(space, raw_best)

    best_params[model_name] = best
    print(f"Mejores hiperparámetros para {model_name}: {best}")

Optimización de hiperparámetros para Logistic Regression...
  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




  2%|▏         | 1/50 [16:40<13:36:40, 1000.01s/trial, best loss: -0.9991905850105774]