## Import

In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import warnings

warnings.filterwarnings("ignore")

## Create Dataset

In [2]:
# Step 1: Create an imbalanced binary classification dataset
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=2,
    n_redundant=8,
    weights=[0.9, 0.1],
    flip_y=0,
    random_state=42,
)

np.unique(y, return_counts=True)

(array([0, 1]), array([900, 100], dtype=int64))

In [3]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

### Experiment 1: Train Logistic Regression Classifier

In [15]:
# Define the model hyperparameters
params = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "multi_class": "auto",
    "random_state": 8888,
}

In [16]:
# Train the model
lr = LogisticRegression(**params)
lr.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,8888
,solver,'lbfgs'
,max_iter,1000


In [17]:
# Predict on the test set
y_pred = lr.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       270
           1       0.62      0.50      0.56        30

    accuracy                           0.92       300
   macro avg       0.79      0.73      0.76       300
weighted avg       0.91      0.92      0.92       300



In [18]:
report_dict: dict = classification_report(y_test, y_pred, output_dict=True)
report_dict

{'0': {'precision': 0.9456521739130435,
  'recall': 0.9666666666666667,
  'f1-score': 0.9560439560439561,
  'support': 270.0},
 '1': {'precision': 0.625,
  'recall': 0.5,
  'f1-score': 0.5555555555555556,
  'support': 30.0},
 'accuracy': 0.92,
 'macro avg': {'precision': 0.7853260869565217,
  'recall': 0.7333333333333334,
  'f1-score': 0.7557997557997558,
  'support': 300.0},
 'weighted avg': {'precision': 0.9135869565217392,
  'recall': 0.92,
  'f1-score': 0.9159951159951161,
  'support': 300.0}}

In [19]:
report_dict["accuracy"]

0.92

In [20]:
import mlflow
from mlflow.sklearn import log_model

In [21]:
mlflow.set_experiment("First Experiment")
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [22]:
with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.log_metrics(
        {
            "accuracy": report_dict["accuracy"],
            "recall_class_0": report_dict["0"]["recall"],
            "recall_class_1": report_dict["1"]["recall"],
            "f1_score_macro": report_dict["macro avg"]["f1-score"],
        }
    )
    log_model(lr, "Logistic Regression")

2025/11/15 16:41:38 INFO mlflow.tracking._tracking_service.client: üèÉ View run resilient-seal-843 at: http://127.0.0.1:5000/#/experiments/783299024493711398/runs/7fed58f723ba4ab9986608c6f319bdf6.
2025/11/15 16:41:38 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/783299024493711398.


### Experiment 2: Train Random Forest Classifier

In [23]:
rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3)
rf_clf.fit(X_train, y_train)

y_pred_rf = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       270
           1       0.95      0.70      0.81        30

    accuracy                           0.97       300
   macro avg       0.96      0.85      0.89       300
weighted avg       0.97      0.97      0.96       300



### Experiment 3: Train XGBoost

In [24]:
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
xgb_clf.fit(X_train, y_train)

y_pred_xgb = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       270
           1       0.96      0.80      0.87        30

    accuracy                           0.98       300
   macro avg       0.97      0.90      0.93       300
weighted avg       0.98      0.98      0.98       300



### Experiment 4: Handle class imbalance using SMOTETomek and then Train XGBoost

In [4]:
from imblearn.combine import SMOTETomek

In [5]:
smt = SMOTETomek(random_state=42)

X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

np.unique(y_train_res, return_counts=True)

(array([0, 1]), array([619, 619], dtype=int64))

In [27]:
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
xgb_clf.fit(X_train_res, y_train_res)

y_pred_xgb = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       270
           1       0.81      0.83      0.82        30

    accuracy                           0.96       300
   macro avg       0.89      0.91      0.90       300
weighted avg       0.96      0.96      0.96       300



<h2 align="center" style="color:blue">Track Experiments Using MLFlow</h2>

In [35]:
models = [
    (
        "Logistic Regression",
        LogisticRegression(C=1, solver="liblinear"),
        (X_train, y_train),
        (X_test, y_test),
    ),
    (
        "Random Forest",
        RandomForestClassifier(n_estimators=30, max_depth=3),
        (X_train, y_train),
        (X_test, y_test),
    ),
    (
        "XGBClassifier",
        XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        (X_train, y_train),
        (X_test, y_test),
    ),
    (
        "XGBClassifier With SMOTE",
        XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        (X_train_res, y_train_res),
        (X_test, y_test),
    ),
]

In [36]:
reports = []

for model_name, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

In [37]:
reports

[{'0': {'precision': 0.9834710743801653,
   'recall': 0.8814814814814815,
   'f1-score': 0.9296875,
   'support': 270.0},
  '1': {'precision': 0.4482758620689655,
   'recall': 0.8666666666666667,
   'f1-score': 0.5909090909090909,
   'support': 30.0},
  'accuracy': 0.88,
  'macro avg': {'precision': 0.7158734682245654,
   'recall': 0.8740740740740741,
   'f1-score': 0.7602982954545454,
   'support': 300.0},
  'weighted avg': {'precision': 0.9299515531490453,
   'recall': 0.88,
   'f1-score': 0.8958096590909091,
   'support': 300.0}},
 {'0': {'precision': 0.988,
   'recall': 0.9148148148148149,
   'f1-score': 0.95,
   'support': 270.0},
  '1': {'precision': 0.54, 'recall': 0.9, 'f1-score': 0.675, 'support': 30.0},
  'accuracy': 0.9133333333333333,
  'macro avg': {'precision': 0.764,
   'recall': 0.9074074074074074,
   'f1-score': 0.8125,
   'support': 300.0},
  'weighted avg': {'precision': 0.9431999999999999,
   'recall': 0.9133333333333333,
   'f1-score': 0.9225,
   'support': 300.0}}

In [6]:
import mlflow

In [39]:
mlflow.set_experiment("Anomaly Detection")
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [41]:
for i, element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]
    # print(report)

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name", model_name)
        mlflow.log_metrics(
            {
                "accuracy": report["accuracy"],
                "recall_class_1": report["1"]["recall"],
                "recall_class_0": report["0"]["recall"],
                "f1_score_macro": report["macro avg"]["f1-score"],
            }
        )

        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")

2025/11/15 17:04:10 INFO mlflow.tracking._tracking_service.client: üèÉ View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/233189936496821645/runs/877157f726ec4e62af1a6ce857643b2f.
2025/11/15 17:04:10 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/233189936496821645.
2025/11/15 17:04:17 INFO mlflow.tracking._tracking_service.client: üèÉ View run Random Forest at: http://127.0.0.1:5000/#/experiments/233189936496821645/runs/63a05c7b7a914d459cb19344da8b830b.
2025/11/15 17:04:17 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/experiments/233189936496821645.
2025/11/15 17:04:22 INFO mlflow.tracking._tracking_service.client: üèÉ View run XGBClassifier at: http://127.0.0.1:5000/#/experiments/233189936496821645/runs/8c8920d9508746bea11b751f692b7c05.
2025/11/15 17:04:22 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://127.0.0.1:5000/#/exp

## Track Experiments
### Models

In [7]:
models = [
    (
        "Logistic Regression",
        {"C": 1, "solver": "liblinear"},
        LogisticRegression(),
        (X_train, y_train),
        (X_test, y_test),
    ),
    (
        "Random Forest",
        {"n_estimators": 30, "max_depth": 3},
        RandomForestClassifier(),
        (X_train, y_train),
        (X_test, y_test),
    ),
    (
        "XGBClassifier",
        {"use_label_encoder": False, "eval_metric": "logloss"},
        XGBClassifier(),
        (X_train, y_train),
        (X_test, y_test),
    ),
    (
        "XGBClassifier With SMOTE",
        {"use_label_encoder": False, "eval_metric": "logloss"},
        XGBClassifier(),
        (X_train_res, y_train_res),
        (X_test, y_test),
    ),
]

### Reports

In [8]:
reports = []

for model_name, params, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]

    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

### Initialize MLflow

In [20]:
mlflow.set_experiment("Anomaly Detection")
# mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_tracking_uri("http://localhost:5000/")

2025/11/16 16:24:44 INFO mlflow.tracking.fluent: Experiment with name 'Anomaly Detection' does not exist. Creating a new experiment.


In [21]:
for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    report = reports[i]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(params)
        mlflow.log_metrics(
            {
                "accuracy": report["accuracy"],
                "recall_class_1": report["1"]["recall"],
                "recall_class_0": report["0"]["recall"],
                "f1_score_macro": report["macro avg"]["f1-score"],
            }
        )

        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")

2025/11/16 16:25:06 INFO mlflow.tracking._tracking_service.client: üèÉ View run Logistic Regression at: http://localhost:5000//#/experiments/1/runs/7b5058ac1a9643978b3facfa307c06ff.
2025/11/16 16:25:06 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000//#/experiments/1.
2025/11/16 16:25:10 INFO mlflow.tracking._tracking_service.client: üèÉ View run Random Forest at: http://localhost:5000//#/experiments/1/runs/eb8c7930a8684518a02db4e760015504.
2025/11/16 16:25:10 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000//#/experiments/1.
2025/11/16 16:25:14 INFO mlflow.tracking._tracking_service.client: üèÉ View run XGBClassifier at: http://localhost:5000//#/experiments/1/runs/0b51bee041784cd8b63c64c4e2c12c51.
2025/11/16 16:25:14 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: http://localhost:5000//#/experiments/1.
2025/11/16 16:25:19 INFO mlflow.tracking._tracking_service.client: 

### Register Model

In [None]:
# result = mlflow.register_model(
#     "runs:/d16076a3ec534311817565e6527539c0/sklearn-model", "sk-learn-random-forest-reg"
# )

In [22]:
# model_run_id='fd9e472a3aff47b08486682a0e326b7c'
model_run_id = input("Enter Run ID:")
model_name = "XGB-SMOTE"
model_uri = f"runs:/{model_run_id}/model"

result = mlflow.register_model(model_uri, model_name)

Successfully registered model 'XGB-SMOTE'.
2025/11/16 16:25:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGB-SMOTE, version 1
Created version '1' of model 'XGB-SMOTE'.


### Load Model

In [27]:
model_version = 1
# model_uri = f"models:/{model_name}/{model_version}"
model_uri = f"models:/{model_name}@challenger"

loaded_model = mlflow.xgboost.load_model(model_uri)
y_pred = loaded_model.predict(X_test)
y_pred[:5]

array([0, 0, 0, 0, 0])

### Transition Model to Production

In [28]:
dev_model_uri = f"models:/{model_name}@challenger"
prod_model = "anomaly-detection-prod"

client = mlflow.MlflowClient()
client.copy_model_version(src_model_uri=dev_model_uri, dst_name=prod_model)

Successfully registered model 'anomaly-detection-prod'.
Copied version '1' of model 'XGB-SMOTE' to version '1' of model 'anomaly-detection-prod'.


<ModelVersion: aliases=[], creation_timestamp=1763285546999, current_stage='None', description='', last_updated_timestamp=1763285546999, name='anomaly-detection-prod', run_id='67323818faee4965844a9d9f54523585', run_link='', source='models:/XGB-SMOTE/1', status='READY', status_message='', tags={}, user_id='', version='1'>

In [29]:
model_uri = f"models:/{prod_model}@champion"

loaded_model = mlflow.xgboost.load_model(model_uri)
y_pred = loaded_model.predict(X_test)
y_pred[:5]

array([0, 0, 0, 0, 0])

In [30]:
loaded_model

0,1,2
,objective,'binary:logistic'
,base_score,'5E-1'
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


## DagsHub Setup

In [None]:
# import dagshub

# dagshub.init(repo_owner="AlainDeLong2k", repo_name="MLflow-DagsHub-Demo", mlflow=True)

# import mlflow

# with mlflow.start_run():
#     mlflow.log_param("parameter name", "value")
#     mlflow.log_metric("metric name", 1)

In [7]:
from dagshub.common.init import init

In [8]:
init(repo_owner="AlainDeLong2k", repo_name="MLflow-DagsHub-Demo", mlflow=True)

In [11]:
mlflow.set_experiment("Anomaly Detection")

for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    report = reports[i]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(params)
        mlflow.log_metrics(
            {
                "accuracy": report["accuracy"],
                "recall_class_1": report["1"]["recall"],
                "recall_class_0": report["0"]["recall"],
                "f1_score_macro": report["macro avg"]["f1-score"],
            }
        )

        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")

2025/11/16 21:38:47 INFO mlflow.tracking.fluent: Experiment with name 'Anomaly Detection' does not exist. Creating a new experiment.
2025/11/16 21:39:04 INFO mlflow.tracking._tracking_service.client: üèÉ View run Logistic Regression at: https://dagshub.com/AlainDeLong2k/MLflow-DagsHub-Demo.mlflow/#/experiments/0/runs/d3d66b82002a4b45a45d6c031bacff26.
2025/11/16 21:39:04 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: https://dagshub.com/AlainDeLong2k/MLflow-DagsHub-Demo.mlflow/#/experiments/0.
2025/11/16 21:39:20 INFO mlflow.tracking._tracking_service.client: üèÉ View run Random Forest at: https://dagshub.com/AlainDeLong2k/MLflow-DagsHub-Demo.mlflow/#/experiments/0/runs/4ab3a5dc474d47a58bfc2a6c48ad2fb2.
2025/11/16 21:39:20 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: https://dagshub.com/AlainDeLong2k/MLflow-DagsHub-Demo.mlflow/#/experiments/0.
2025/11/16 21:39:31 INFO mlflow.tracking._tracking_service.client: üèÉ View run XGBClassif

In [10]:
model_name = "xgboost_anomaly_detection"
model_version = 1

In [11]:
model_uri = f"models:/{model_name}/{model_version}"

loaded_model = mlflow.xgboost.load_model(model_uri)
loaded_model

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

0,1,2
,objective,'binary:logistic'
,base_score,'5E-1'
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [12]:
mlflow.get_artifact_uri()

'mlflow-artifacts:/78ae8e7884b3460e8e71e15eb594cc4b/ff7e55e0997043df90306c8c49bf4646/artifacts'

In [14]:
local_path = mlflow.artifacts.download_artifacts(
    artifact_uri="mlflow-artifacts:/78ae8e7884b3460e8e71e15eb594cc4b/ff7e55e0997043df90306c8c49bf4646/artifacts"
)

local_path

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

MlflowException: The following failures occurred while downloading one or more artifacts from https://dagshub.com/AlainDeLong2k/MLflow-DagsHub-Demo.mlflow/api/2.0/mlflow-artifacts/artifacts/78ae8e7884b3460e8e71e15eb594cc4b/ff7e55e0997043df90306c8c49bf4646:
##### File artifacts #####
API request to https://dagshub.com/AlainDeLong2k/MLflow-DagsHub-Demo.mlflow/api/2.0/mlflow-artifacts/artifacts/78ae8e7884b3460e8e71e15eb594cc4b/ff7e55e0997043df90306c8c49bf4646/artifacts failed with exception HTTPSConnectionPool(host='dagshub.com', port=443): Max retries exceeded with url: /AlainDeLong2k/MLflow-DagsHub-Demo.mlflow/api/2.0/mlflow-artifacts/artifacts/78ae8e7884b3460e8e71e15eb594cc4b/ff7e55e0997043df90306c8c49bf4646/artifacts (Caused by ResponseError('too many 500 error responses'))

In [15]:
local_uri = mlflow.artifacts.download_artifacts(
    run_id="f4612260943d4c189ee67af51ecb5539"
)

local_uri

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

'C:\\Users\\LONG\\AppData\\Local\\Temp\\tmp72639c4w\\'

In [16]:
dir(loaded_model)

['_Booster',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_is_fitted__',
 '__sklearn_tags__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_can_use_inplace_predict',
 '_configure_fit',
 '_create_dmatrix',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_get_default_requests',
 '_get_doc_link',
 '_get_iteration_range',
 '_get_metadata_request',
 '_get_param_names',
 '_get_params_html',
 '_get_type',
 '_html_repr',
 '_load_model_attributes',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_set_evaluation_result',
 '_