In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
import os 


warnings.filterwarnings('ignore')

load_dir = '../data/gold/'

# mlflow server --backend-store-uri sqlite:///backend.db --default-artifact-root ./artifacts_local

In [2]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment('fraud_random_forest_model')


<Experiment: artifact_location='/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2', creation_time=1723978570179, experiment_id='2', last_update_time=1723978570179, lifecycle_stage='active', name='fraud_random_forest_model', tags={}>

In [3]:
# X_train = load_npz(os.path.join(load_dir, 'X_train_scaled.npz'))
# X_test = load_npz(os.path.join(load_dir, 'X_test_scaled.npz'))

# y_train = joblib.load(os.path.join(load_dir, 'y_train.pkl'))
# y_test = joblib.load(os.path.join(load_dir, 'y_test.pkl'))

data = pd.read_parquet('../data/gold/df_fraud_final.parquet', engine= 'fastparquet')
data

Unnamed: 0_level_0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,day_of_month,amount_range,diffbalanceOrig,diffbalanceDest
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1044,1,PAYMENT,4.518420e+03,C1504321715,8.790000e+03,4.271580e+03,M910123336,0.000000e+00,0.000000e+00,no_fraud,1,1.000-10.000,-4518.419922,0.00000
439,1,CASH_IN,1.314093e+05,C1476235721,8.491742e+06,8.623151e+06,C1068824137,2.902760e+05,2.650924e+05,no_fraud,1,100.000-1.000.000,131409.000000,-25183.65625
3718,2,PAYMENT,5.621400e+02,C837981622,7.221367e+05,7.215745e+05,M2053668237,0.000000e+00,0.000000e+00,no_fraud,1,0-1.000,-562.187500,0.00000
2919,2,CASH_IN,8.851071e+04,C2145921383,5.775554e+06,5.864064e+06,C977993101,1.180073e+06,7.442649e+05,no_fraud,1,10.000-100.000,88510.500000,-435807.81250
3957,3,CASH_IN,1.684791e+05,C1644155208,6.673927e+06,6.842406e+06,C575335780,3.086425e+05,1.401634e+05,no_fraud,1,100.000-1.000.000,168479.000000,-168479.09375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362584,741,TRANSFER,5.674548e+06,C992223106,5.674548e+06,5.674548e+06,C1366804249,0.000000e+00,0.000000e+00,fraud,31,1.000.000-10.000.000,0.000000,0.00000
6362608,742,TRANSFER,2.583554e+05,C1226129332,2.583554e+05,0.000000e+00,C1744173808,0.000000e+00,0.000000e+00,fraud,31,100.000-1.000.000,-258355.421875,0.00000
6362601,742,CASH_OUT,6.529939e+05,C1614818636,6.529939e+05,0.000000e+00,C362803701,0.000000e+00,6.529939e+05,fraud,31,100.000-1.000.000,-652993.937500,652993.93750
6362619,743,CASH_OUT,8.500025e+05,C1280323807,8.500025e+05,0.000000e+00,C873221189,6.510099e+06,7.360102e+06,fraud,31,100.000-1.000.000,-850002.500000,850002.50000


In [4]:
categorical_features = ['type', 'nameDest']
numeric_features = ['step', 'amount', 'oldbalanceOrig', 'oldbalanceDest', 'diffbalanceOrig', 'diffbalanceDest']

data = data.rename(columns={'oldbalanceOrg': 'oldbalanceOrig'})
data['diffbalanceOrig'] = data['newbalanceOrig'] - data['oldbalanceOrig']
data['diffbalanceDest'] = data['newbalanceDest'] - data['oldbalanceDest']
data = data.drop(columns=['newbalanceOrig', 'nameOrig', 'newbalanceDest'])
data['isFraud'] = data['isFraud'].map({'fraud': 1, 'no_fraud': 0})
# data['type'] = data['type'].map({'CASH_OUT': 1, 'PAYMENT': 2, 'CASH_IN': 3, 'TRANSFER': 4, 'DEBIT': 5})

In [5]:
X = data.drop(columns= 'isFraud')
y = data['isFraud']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [6]:
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

X_train = preprocessor.fit_transform(X_train)

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

X_test = preprocessor.transform(X_test)



In [7]:
search_space = {
    'max_depth': hp.quniform('max_depth', 5, 50, 5),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 20, 1), 
    'n_estimators': hp.quniform('n_estimators', 50, 200, 10),
    'ccp_alpha': hp.uniform('ccp_alpha', 0.0, 0.1),
    'criterion': hp.choice('criterion', ['gini', 'entropy', 'log_loss']),
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'random_state': 42,  
}


In [8]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('developer', 'Maldu')
        
        # Log data paths
        mlflow.log_param('train-data-path', os.path.join(load_dir, 'X_train_scaled.npz'))
        mlflow.log_param('val-data-path', os.path.join(load_dir, 'X_test_scaled.npz'))
        mlflow.log_param('train-labels-path', os.path.join(load_dir, 'y_train.pkl'))
        mlflow.log_param('val-labels-path', os.path.join(load_dir, 'y_test.pkl'))
        
        # Log parameters
        n_estimators = int(params['n_estimators'])
        max_depth = int(params['max_depth'])
        min_samples_split = int(params['min_samples_split'])
        
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_param('max_depth', max_depth)
        mlflow.log_param('min_samples_split', min_samples_split)
        
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42
        )
        
        model.fit(X_train, y_train)
        
        # Evaluate the model on train 
        train_accuracy = model.score(X_train, y_train)
        mlflow.log_metric('train_accuracy', train_accuracy)
        
        
        # Evaluate the model on test
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy)

        
        test_accuracy = model.score(X_test, y_test)
        mlflow.log_metric('test_accuracy', test_accuracy)
        
        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        cm_display = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        # Classification report 
        report = classification_report(y_test, y_pred, output_dict=True)
        for key, value in report.items():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    mlflow.log_metric(f'{key}_{sub_key}', sub_value)
            else:
                mlflow.log_metric(f'{key}', value)
        
        # ROC and AUC
        y_prob = model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_prob)
        mlflow.log_metric('roc_auc', roc_auc)
        
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        plt.savefig("roc_curve.png")
        mlflow.log_artifact("roc_curve.png")
        plt.close()
        
        
        # Save the model
        mlflow.sklearn.log_model(model, "model")
        print(f"Default artifacts URI: '{mlflow.get_artifact_uri()}'")
        
        return {'loss': -accuracy, 'status': STATUS_OK}


In [9]:
trials = Trials()

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,  
    trials=trials
)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/5658638a26734f5f9870d7b74e620482/artifacts'
  0%|          | 0/10 [00:02<?, ?trial/s, best loss=?]

2024/08/18 12:57:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run monumental-rook-964 at: http://127.0.0.1:5000/#/experiments/2/runs/5658638a26734f5f9870d7b74e620482.

2024/08/18 12:57:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 10%|█         | 1/10 [00:02<00:25,  2.79s/trial, best loss: -0.9641255605381166]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/0982ed8010014d1c96cf77a5f5ac0f3b/artifacts'
 10%|█         | 1/10 [00:05<00:25,  2.79s/trial, best loss: -0.9641255605381166]

2024/08/18 12:57:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run exultant-goose-653 at: http://127.0.0.1:5000/#/experiments/2/runs/0982ed8010014d1c96cf77a5f5ac0f3b.

2024/08/18 12:57:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 20%|██        | 2/10 [00:05<00:20,  2.51s/trial, best loss: -0.9641255605381166]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/a86c7a9b24044cbcb5896d29f4616cfb/artifacts'
 20%|██        | 2/10 [00:07<00:20,  2.51s/trial, best loss: -0.9641255605381166]

2024/08/18 12:57:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run efficient-sheep-637 at: http://127.0.0.1:5000/#/experiments/2/runs/a86c7a9b24044cbcb5896d29f4616cfb.

2024/08/18 12:57:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 30%|███       | 3/10 [00:07<00:15,  2.27s/trial, best loss: -0.9641255605381166]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/fc7d0913f37945d192cd02bfd22279f2/artifacts'
 30%|███       | 3/10 [00:09<00:15,  2.27s/trial, best loss: -0.9641255605381166]

2024/08/18 12:57:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run gregarious-mole-614 at: http://127.0.0.1:5000/#/experiments/2/runs/fc7d0913f37945d192cd02bfd22279f2.

2024/08/18 12:57:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 40%|████      | 4/10 [00:09<00:13,  2.18s/trial, best loss: -0.968609865470852] 




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/4f1e3b54f1cd41d6a72b825b610a3197/artifacts'
 40%|████      | 4/10 [00:11<00:13,  2.18s/trial, best loss: -0.968609865470852]

2024/08/18 12:57:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run polite-fly-759 at: http://127.0.0.1:5000/#/experiments/2/runs/4f1e3b54f1cd41d6a72b825b610a3197.

2024/08/18 12:57:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 50%|█████     | 5/10 [00:11<00:10,  2.16s/trial, best loss: -0.968609865470852]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/a29163c4d0a44689b5ca9e4f04a6a45a/artifacts'
 50%|█████     | 5/10 [00:13<00:10,  2.16s/trial, best loss: -0.968609865470852]

2024/08/18 12:57:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run burly-bird-57 at: http://127.0.0.1:5000/#/experiments/2/runs/a29163c4d0a44689b5ca9e4f04a6a45a.

2024/08/18 12:57:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 60%|██████    | 6/10 [00:13<00:08,  2.21s/trial, best loss: -0.968609865470852]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/e8535a4ed59448ab81d9a63dea1a1749/artifacts'
 60%|██████    | 6/10 [00:15<00:08,  2.21s/trial, best loss: -0.968609865470852]

2024/08/18 12:57:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run invincible-fawn-710 at: http://127.0.0.1:5000/#/experiments/2/runs/e8535a4ed59448ab81d9a63dea1a1749.

2024/08/18 12:57:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 70%|███████   | 7/10 [00:15<00:06,  2.11s/trial, best loss: -0.968609865470852]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/6b8a50816f4d4eb489ed1b4d47cc6ed9/artifacts'
 70%|███████   | 7/10 [00:17<00:06,  2.11s/trial, best loss: -0.968609865470852]

2024/08/18 12:57:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run debonair-goose-352 at: http://127.0.0.1:5000/#/experiments/2/runs/6b8a50816f4d4eb489ed1b4d47cc6ed9.

2024/08/18 12:57:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 80%|████████  | 8/10 [00:17<00:04,  2.02s/trial, best loss: -0.968609865470852]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/311ec8dca66147b185edaa7d2ef09d38/artifacts'
 80%|████████  | 8/10 [00:19<00:04,  2.02s/trial, best loss: -0.968609865470852]

2024/08/18 12:57:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run adventurous-carp-978 at: http://127.0.0.1:5000/#/experiments/2/runs/311ec8dca66147b185edaa7d2ef09d38.

2024/08/18 12:57:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



 90%|█████████ | 9/10 [00:19<00:01,  2.00s/trial, best loss: -0.968609865470852]




Default artifacts URI: '/home/maldu/dscience/projects/fraud_detection/experiments/artifacts_local/2/3de6f69b123c4f0c954b4068613094b6/artifacts'
 90%|█████████ | 9/10 [00:21<00:01,  2.00s/trial, best loss: -0.968609865470852]

2024/08/18 12:57:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run caring-ray-654 at: http://127.0.0.1:5000/#/experiments/2/runs/3de6f69b123c4f0c954b4068613094b6.

2024/08/18 12:57:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.



100%|██████████| 10/10 [00:21<00:00,  2.12s/trial, best loss: -0.968609865470852]


In [10]:
print(f"Best hyperparameters: {best}")

Best hyperparameters: {'bootstrap': 0, 'ccp_alpha': 0.07354053731740971, 'criterion': 0, 'max_depth': 45.0, 'max_features': 0, 'min_samples_leaf': 7.0, 'min_samples_split': 3.0, 'n_estimators': 110.0}
