In [1]:

import mlflow
import mlflow.sklearn
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
import joblib
import os 
from scipy.sparse import load_npz

warnings.filterwarnings('ignore')

load_dir = '../data/gold/'

In [2]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment('fraud_random_forest_model')


<Experiment: artifact_location='/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2', creation_time=1723547437350, experiment_id='2', last_update_time=1723547437350, lifecycle_stage='active', name='fraud_random_forest_model', tags={}>

In [3]:
X_train = load_npz(os.path.join(load_dir, 'X_train_scaled.npz'))
X_test = load_npz(os.path.join(load_dir, 'X_test_scaled.npz'))

y_train = joblib.load(os.path.join(load_dir, 'y_train.pkl'))
y_test = joblib.load(os.path.join(load_dir, 'y_test.pkl'))

In [4]:

search_space = {
    'max_depth': hp.quniform('max_depth', 5, 50, 5),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
    'min_samples_split': hp.quniform('min_samples_split', 18) + 2,  
    'n_estimators': hp.quniform('n_estimators', 50, 200, 10),
    'ccp_alpha': hp.uniform('ccp_alpha', 0.0, 0.1),
    'criterion': hp.choice('criterion', ['gini', 'entropy', 'log_loss']),
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'random_state': 42,
}

In [5]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('developer', 'Maldu')
        
        # Log data paths
        mlflow.log_param('train-data-path', os.path.join(load_dir, 'X_train_scaled.npz'))
        mlflow.log_param('val-data-path', os.path.join(load_dir, 'X_test_scaled.npz'))
        mlflow.log_param('train-labels-path', os.path.join(load_dir, 'y_train.pkl'))
        mlflow.log_param('val-labels-path', os.path.join(load_dir, 'y_test.pkl'))
        
        # Log parameters
        n_estimators = int(params['n_estimators'])
        max_depth = int(params['max_depth'])
        min_samples_split = int(params['min_samples_split'])
        
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_param('max_depth', max_depth)
        mlflow.log_param('min_samples_split', min_samples_split)
        
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42
        )
        
        model.fit(X_train, y_train)
        
        # Evaluate the model on train 
        train_accuracy = model.score(X_train, y_train)
        mlflow.log_metric('train_accuracy', train_accuracy)
        
        
        # Evaluate the model on test
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy)

        
        test_accuracy = model.score(X_test, y_test)
        mlflow.log_metric('test_accuracy', test_accuracy)
        
        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        cm_display = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        # Classification report 
        report = classification_report(y_test, y_pred, output_dict=True)
        for key, value in report.items():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    mlflow.log_metric(f'{key}_{sub_key}', sub_value)
            else:
                mlflow.log_metric(f'{key}', value)
        
        # ROC and AUC
        y_prob = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_prob)
        mlflow.log_metric('roc_auc', roc_auc)
        
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        plt.savefig("roc_curve.png")
        mlflow.log_artifact("roc_curve.png")
        plt.close()
        
        
        # Save the model
        mlflow.sklearn.log_model(model, "model")
        print(f"Default artifacts URI: '{mlflow.get_artifact_uri()}'")
        
        return {'loss': -accuracy, 'status': STATUS_OK}


In [6]:
trials = Trials()

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,  
    trials=trials
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/26ea2b1e6d9c407db8a8386e930ae5ac/artifacts'
  0%|          | 0/10 [00:04<?, ?trial/s, best loss=?]

2024/08/13 13:13:27 INFO mlflow.tracking._tracking_service.client: 🏃 View run unique-mouse-306 at: http://127.0.0.1:5000/#/experiments/2/runs/26ea2b1e6d9c407db8a8386e930ae5ac.

2024/08/13 13:13:27 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 10%|█         | 1/10 [00:04<00:43,  4.87s/trial, best loss: -0.998]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/e7d3bb340b534fdc83d9f52c0d38d541/artifacts'
 10%|█         | 1/10 [00:08<00:43,  4.87s/trial, best loss: -0.998]

2024/08/13 13:13:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run resilient-ox-769 at: http://127.0.0.1:5000/#/experiments/2/runs/e7d3bb340b534fdc83d9f52c0d38d541.

2024/08/13 13:13:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 20%|██        | 2/10 [00:08<00:30,  3.87s/trial, best loss: -0.998]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/69a782cae30a4cddb12e431c426ce0c0/artifacts'
 20%|██        | 2/10 [00:10<00:30,  3.87s/trial, best loss: -0.998]

2024/08/13 13:13:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run resilient-snail-806 at: http://127.0.0.1:5000/#/experiments/2/runs/69a782cae30a4cddb12e431c426ce0c0.

2024/08/13 13:13:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 30%|███       | 3/10 [00:10<00:22,  3.25s/trial, best loss: -0.998]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/0aa45f1a48ca43b49a4ff26555eb76bb/artifacts'
 30%|███       | 3/10 [00:12<00:22,  3.25s/trial, best loss: -0.998]

2024/08/13 13:13:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run stylish-fawn-572 at: http://127.0.0.1:5000/#/experiments/2/runs/0aa45f1a48ca43b49a4ff26555eb76bb.

2024/08/13 13:13:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 40%|████      | 4/10 [00:13<00:17,  2.93s/trial, best loss: -0.998]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/f153bf5fc1fd4dc09a076f4ca0db674c/artifacts'
 40%|████      | 4/10 [00:16<00:17,  2.93s/trial, best loss: -0.998]

2024/08/13 13:13:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run enthused-gnat-916 at: http://127.0.0.1:5000/#/experiments/2/runs/f153bf5fc1fd4dc09a076f4ca0db674c.

2024/08/13 13:13:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 50%|█████     | 5/10 [00:16<00:15,  3.05s/trial, best loss: -0.998]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/58e33df0bc274f83b8396fbde4580a70/artifacts'
 50%|█████     | 5/10 [00:19<00:15,  3.05s/trial, best loss: -0.998]

2024/08/13 13:13:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run abrasive-penguin-459 at: http://127.0.0.1:5000/#/experiments/2/runs/58e33df0bc274f83b8396fbde4580a70.

2024/08/13 13:13:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 60%|██████    | 6/10 [00:19<00:12,  3.05s/trial, best loss: -0.998]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/a65e51ff9f4a4a118c1d2a8bdf2ea5b5/artifacts'
 60%|██████    | 6/10 [00:22<00:12,  3.05s/trial, best loss: -0.998]

2024/08/13 13:13:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run nebulous-shrew-733 at: http://127.0.0.1:5000/#/experiments/2/runs/a65e51ff9f4a4a118c1d2a8bdf2ea5b5.

2024/08/13 13:13:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 70%|███████   | 7/10 [00:22<00:09,  3.15s/trial, best loss: -0.998]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/21f90c4f50204c6a882e6582c6ff855a/artifacts'
 70%|███████   | 7/10 [00:25<00:09,  3.15s/trial, best loss: -0.998]

2024/08/13 13:13:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run sassy-stag-284 at: http://127.0.0.1:5000/#/experiments/2/runs/21f90c4f50204c6a882e6582c6ff855a.

2024/08/13 13:13:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 80%|████████  | 8/10 [00:25<00:06,  3.05s/trial, best loss: -0.998]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/7c2bd8efba4e4eb591f62023efb1b6c3/artifacts'
 80%|████████  | 8/10 [00:31<00:06,  3.05s/trial, best loss: -0.998]

2024/08/13 13:13:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run traveling-cow-158 at: http://127.0.0.1:5000/#/experiments/2/runs/7c2bd8efba4e4eb591f62023efb1b6c3.

2024/08/13 13:13:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 90%|█████████ | 9/10 [00:31<00:03,  3.92s/trial, best loss: -0.998]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/10812e31a9ab4d2aa8709c3037f70cbd/artifacts'
 90%|█████████ | 9/10 [00:33<00:03,  3.92s/trial, best loss: -0.998]

2024/08/13 13:13:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run stately-deer-491 at: http://127.0.0.1:5000/#/experiments/2/runs/10812e31a9ab4d2aa8709c3037f70cbd.

2024/08/13 13:13:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



100%|██████████| 10/10 [00:34<00:00,  3.40s/trial, best loss: -0.998]


In [7]:
print(f"Best hyperparameters: {best}")

Best hyperparameters: {'bootstrap': 0, 'ccp_alpha': 0.09764165713133799, 'criterion': 1, 'max_depth': 45.0, 'max_features': 1, 'min_samples_leaf': 4.0, 'min_samples_split': 12, 'n_estimators': 130.0}
