# Mlflow Expermintal Notebook

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
test_df = pd.read_csv('../data/cleaned_test.csv')
train_df = pd.read_csv('../data/cleaned_train.csv')

In [5]:
import joblib
pipeline = joblib.load('../artifacts/preprocessing_pipeline.pkl')
train_df = pipeline.transform(train_df)
test_df = pipeline.transform(test_df)

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   is_fraud                 1296675 non-null  int64  
 1   last_hour_count          1296675 non-null  float64
 2   last_hour_avg            1296675 non-null  float64
 3   last_24h_count           1296675 non-null  float64
 4   last_24h_avg             1296675 non-null  float64
 5   dist                     1296675 non-null  float64
 6   dist_diff                1296675 non-null  float64
 7   D_Evening                1296675 non-null  bool   
 8   D_Morning                1296675 non-null  bool   
 9   D_Night                  1296675 non-null  bool   
 10  category_food_dining     1296675 non-null  int32  
 11  category_gas_transport   1296675 non-null  int32  
 12  category_grocery_net     1296675 non-null  int32  
 13  category_grocery_pos     1296675 non-null 

In [7]:
X = train_df.drop('is_fraud', axis=1)
y = train_df['is_fraud']

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [9]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [14]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, roc_auc_score, confusion_matrix, classification_report
import os

# Initialize MLflow Experiment
mlflow.set_experiment("Anomaly Detection")
mlflow.set_tracking_uri("http://localhost:5000")
os.environ["MLFLOW_ARTIFACT_ROOT"] = "file:///D:/final project/FruadDetection/github"

2025/05/01 18:22:57 INFO mlflow.tracking.fluent: Experiment with name 'Anomaly Detection' does not exist. Creating a new experiment.


# Random Forest

In [12]:
def evaluate_supervised_model(model, model_name, X_train, y_train, X_test, y_test):
    """ Train, evaluate, and log a supervised model (Random Forest) in MLflow """
    with mlflow.start_run(run_name=model_name): 
        
        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        kappa = cohen_kappa_score(y_test, y_pred)

        # AUC Calculation
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_pred_proba)

        # Log metrics & model in MLflow
        mlflow.log_params(model.get_params())  
        mlflow.log_metrics({"accuracy": acc, "precision": prec, "recall": rec, "f1_score": f1, "kappa": kappa, "roc_auc": auc})
        mlflow.sklearn.log_model(model, model_name)

        print(f"\n🚀 {model_name} Evaluation Completed\n")
        print("Accuracy:", acc)
        print("Precision:", prec)
        print("Recall:", rec)
        print("F1 Score:", f1)
        print("Cohen Kappa Score:", kappa)
        print("ROC-AUC Score:", auc)
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [15]:
# Initialize model
rf_clf = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=345, n_jobs=-1, verbose=1)

# Run evaluation
evaluate_supervised_model(rf_clf, "Random Forest", X_train, y_train, X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   45.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.5s finished



🚀 Random Forest Evaluation Completed

Accuracy: 0.9983701925177955
Precision: 0.9303191489361702
Recall: 0.7766429840142096
F1 Score: 0.8465634075508228
Cohen Kappa Score: 0.8457508281499154
ROC-AUC Score: 0.9941677275265799
Confusion Matrix:
 [[386620    131]
 [   503   1749]]


2025/05/01 18:23:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://localhost:5000/#/experiments/554229172752345466/runs/4888726a13d74bbc9d6cd1b806909ed2.
2025/05/01 18:23:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/554229172752345466.



Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    386751
           1       0.93      0.78      0.85      2252

    accuracy                           1.00    389003
   macro avg       0.96      0.89      0.92    389003
weighted avg       1.00      1.00      1.00    389003



# Isolation Forest

In [18]:
def evaluate_AnomalyDetection_model(model, model_name, X_train, X_test, y_test):
    """ Train, evaluate, and log an unsupervised model (Isolation Forest) in MLflow """
    with mlflow.start_run(run_name=model_name): 
        
        # Train Isolation Forest (Only X, no Y)
        model.fit(X_train)
        y_pred = model.predict(X_test)
        y_pred = [1 if x == -1 else 0 for x in y_pred]  # Convert anomalies (-1) to fraud (1)

        # Confusion Matrix & Classification Report
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        # Compute ROC-AUC using decision_function
        y_prob = model.decision_function(X_test)
        auc = roc_auc_score(y_test, y_prob)

        # Calculate Additional Metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        kappa = cohen_kappa_score(y_test, y_pred)

        # Log metrics & model in MLflow
        mlflow.log_params(model.get_params())  
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1,
            "cohen_kappa": kappa,
            "roc_auc": auc
        })
        mlflow.sklearn.log_model(model, model_name)

        print(f"\n🔥 {model_name} Evaluation Completed\n")
        print("Accuracy:", acc)
        print("Precision:", prec)
        print("Recall:", rec)
        print("F1 Score:", f1)
        print("Cohen Kappa Score:", kappa)
        print("ROC-AUC Score:", auc)
        print("Confusion Matrix:\n", cm)
        print("\nClassification Report:\n", report)

In [19]:
# Initialize model
iso_forest = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=1)

# Run evaluation
evaluate_AnomalyDetection_model(iso_forest, "Isolation Forest", X_train, X_test, y_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.9s remaining:    2.9s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.2s
2025/05/01 18:26:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run Isolation Forest at: http://localhost:5000/#/experiments/554229172752345466/runs/346cf469c288450ba608f7552d05b9b0.
2025/05/01 18:26:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/554229172752345466.



🔥 Isolation Forest Evaluation Completed

Accuracy: 0.972912805299702
Precision: 0.13441002559350454
Recall: 0.6762877442273535
F1 Score: 0.2242509018626224
Cohen Kappa Score: 0.21668511065723084
ROC-AUC Score: 0.06892917509681568
Confusion Matrix:
 [[376943   9808]
 [   729   1523]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99    386751
           1       0.13      0.68      0.22      2252

    accuracy                           0.97    389003
   macro avg       0.57      0.83      0.61    389003
weighted avg       0.99      0.97      0.98    389003



In [28]:
from mlflow.tracking import MlflowClient
model_name = 'Isolation Forest'
run_id=input('Please type RunID')
model_uri = f'runs:/{run_id}/model'

with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri=model_uri, name=model_name)

client = MlflowClient()
run = client.get_run(run_id)

Successfully registered model 'Isolation Forest'.
2025/05/01 18:39:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Isolation Forest, version 1
Created version '1' of model 'Isolation Forest'.
2025/05/01 18:39:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run Isolation Forest at: http://localhost:5000/#/experiments/554229172752345466/runs/346cf469c288450ba608f7552d05b9b0.
2025/05/01 18:39:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/554229172752345466.


In [None]:
model_version = 1
model_uri = f"models:/{model_name}/{model_version}"

loaded_model = mlflow.sklearn.load_model(model_uri)
y_pred = loaded_model.predict(X_test)
y_pred = [1 if x == -1 else 0 for x in y_pred]
print(y_pred)

In [None]:
# dagshub setup
import dagshub
dagshub.init(repo_owner='learnpythonlanguage', repo_name='mlflow_dagshub_demo', mlflow=True)

# # Ideally you will not require following 4 lines if you have started fresh and do not have any previous dagshub credentials on your computer
# import os
# os.environ['MLFLOW_TRACKING_USERNAME'] = 'your user name' # 'learnpythonlanguage'
# os.environ['MLFLOW_TRACKING_PASSWORD'] = 'your password' # 
# os.environ['MLFLOW_TRACKING_URI'] = 'your dagshub unique uri' # https://dagshub.com/learnpythonlanguage/mlflow_dagshub_demo.mlflow
