In [8]:
import mlflow
import mlflow.sklearn
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import joblib
from scipy.sparse import load_npz


import warnings
warnings.filterwarnings('ignore')

load_dir = '../../data/gold/'

In [9]:
import mlflow


mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment('fraud_detection_reduced_experiment_v1')

# experiment = mlflow.get_experiment_by_name('fraud_detection_reduced_experiment_v1')

# if experiment:
#     print(f"ID: {experiment.experiment_id}, Name: {experiment.name}")
# else:
#     print("The experiment doesn't exist")

<Experiment: artifact_location='/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3', creation_time=1723021074916, experiment_id='3', last_update_time=1723021074916, lifecycle_stage='active', name='fraud_detection_reduced_experiment_v1', tags={}>

In [10]:
try:
    experiments = mlflow.search_experiments()
    for exp in experiments:
        print(f"ID: {exp.experiment_id}, Name: {exp.name}")
except AttributeError as e:
    print(f"Error: {e}")

ID: 3, Name: fraud_detection_reduced_experiment_v1
ID: 2, Name: fraud_detection_experiment_v1
ID: 1, Name: model-experiment-v1
ID: 0, Name: Default


In [11]:
X_train_scaled = load_npz(os.path.join(load_dir, 'X_train_scaled.npz'))
X_test_scaled = load_npz(os.path.join(load_dir, 'X_test_scaled.npz'))

y_train = joblib.load(os.path.join(load_dir, 'y_train.pkl'))
y_test = joblib.load(os.path.join(load_dir, 'y_test.pkl'))

In [12]:
search_space = {
    'n_estimators': hp.quniform('n_estimators', 50, 100, 10),
    'max_depth': hp.quniform('max_depth', 10, 20, 2),
    'min_samples_split': hp.uniform('min_samples_split', 0.1, 0.2),
}

In [13]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('developer', 'Ana')
        
        # Log data paths
        mlflow.log_param('train-data-path', os.path.join(load_dir, 'X_train_scaled.npz'))
        mlflow.log_param('val-data-path', os.path.join(load_dir, 'X_test_scaled.npz'))
        mlflow.log_param('train-labels-path', os.path.join(load_dir, 'y_train.pkl'))
        mlflow.log_param('val-labels-path', os.path.join(load_dir, 'y_test.pkl'))
        
        # Log parameters
        n_estimators = int(params['n_estimators'])
        max_depth = int(params['max_depth'])
        min_samples_split = params['min_samples_split']
        
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_param('max_depth', max_depth)
        mlflow.log_param('min_samples_split', min_samples_split)
        
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42
        )
        
        model.fit(X_train_scaled, y_train)
        
        # Evaluar el modelo en los datos de prueba
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy)

        # Evaluar el modelo en los datos de entrenamiento
        train_accuracy = model.score(X_train_scaled, y_train)
        mlflow.log_metric('train_accuracy', train_accuracy)
        
        test_accuracy = model.score(X_test_scaled, y_test)
        mlflow.log_metric('test_accuracy', test_accuracy)
        # Calcular y mostrar la matriz de confusión
        cm = confusion_matrix(y_test, y_pred)
        cm_display = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        # Mostrar el reporte de clasificación
        report = classification_report(y_test, y_pred, output_dict=True)
        for key, value in report.items():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    mlflow.log_metric(f'{key}_{sub_key}', sub_value)
            else:
                mlflow.log_metric(f'{key}', value)
        
        # Calcular y mostrar la curva ROC y el AUC
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
        roc_auc = roc_auc_score(y_test, y_prob)
        mlflow.log_metric('roc_auc', roc_auc)
        
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        plt.savefig("roc_curve.png")
        mlflow.log_artifact("roc_curve.png")
        plt.close()
        
        # Guardar el modelo
        mlflow.sklearn.log_model(model, "model")
        print(f"Default artifacts URI: '{mlflow.get_artifact_uri()}'")
        
        return {'loss': -accuracy, 'status': STATUS_OK}

In [14]:
trials = Trials()

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,  
    trials=trials
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/bb118cfa08104571af7e3bb6707ea282/artifacts'
 10%|█         | 1/10 [02:06<18:58, 126.53s/trial, best loss: -0.9987269395311994]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/250b3066baa743439e2648f633bca496/artifacts'
 20%|██        | 2/10 [04:02<16:00, 120.03s/trial, best loss: -0.9987269395311994]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/e6bad078261249d88a77893adf0100a7/artifacts'
 30%|███       | 3/10 [05:55<13:38, 116.90s/trial, best loss: -0.9987269395311994]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/c6a3942243b34a20b55a78575bea8193/artifacts'
 40%|████      | 4/10 [07:56<11:52, 118.79s/trial, best loss: -0.9987269395311994]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/4ba77f08f82e4ca89705c9fc331a3efd/artifacts'
 50%|█████     | 5/10 [10:57<11:44, 140.99s/trial, best loss: -0.9987269395311994]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/03f0722120404dd89f3aef249ef41692/artifacts'
 60%|██████    | 6/10 [13:00<09:00, 135.12s/trial, best loss: -0.9987269395311994]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/0458af4f85994657a39be758cdfff288/artifacts'
 70%|███████   | 7/10 [14:36<06:06, 122.08s/trial, best loss: -0.9987269395311994]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/fa042f8fe8c54ea7a8c7c124329e5762/artifacts'
 80%|████████  | 8/10 [16:16<03:50, 115.30s/trial, best loss: -0.9987269395311994]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/ed624b9087ec4214b63edde7adf770eb/artifacts'
 90%|█████████ | 9/10 [17:56<01:50, 110.32s/trial, best loss: -0.9987269395311994]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/notebooks/experiments/mlruns/3/72915f2752604ff0be011fb1489b82f2/artifacts'
100%|██████████| 10/10 [19:29<00:00, 116.92s/trial, best loss: -0.9987269395311994]


In [15]:
print(f"Mejores hiperparámetros: {best}")

Mejores hiperparámetros: {'max_depth': 12.0, 'min_samples_split': 0.10553180327691568, 'n_estimators': 80.0}


In [16]:
unbalanced needs fp and fn

SyntaxError: invalid syntax (3586699426.py, line 1)