Based on: [https://github.com/MicrosoftLearning/mslearn-azure-ml/blob/main/Labs/07/Track%20model%20training%20with%20MLflow.ipynb]

In [None]:
# Import the required libraries
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Workspace coordinates, pre-filled from the previous experiment.
# Replace the placeholder subscription ID with your own before running.
subscription_id = "your-subscription-id"
resource_group = "ml-corp-test-rg"
workspace_name = "ml-corp-test-ws"

# Authenticate and build the client that is bound to the workspace above.
credential = DefaultAzureCredential()
ml_client = MLClient(credential, subscription_id, resource_group, workspace_name)

# Fetch the workspace record and print its name, resource group, location and
# subscription. The subscription ID is read from the client's public
# `subscription_id` property rather than a private attribute of a sub-client.
workspace = ml_client.workspaces.get(name=ml_client.workspace_name)
print(ml_client.workspace_name, workspace.resource_group, workspace.location, ml_client.subscription_id, sep='\n')

ml-workspace-corp-test-swn-001
rg-corp-data-ai-platform-001
switzerlandnorth
7dae9e0f-de34-4921-91be-67945119f760


In [None]:
import pandas as pd

# Load the diabetes dataset from the local CSV file and preview its first rows.
print("Reading data...")
df = pd.read_csv(filepath_or_buffer='./diabetes-data/diabetes.csv')
df.head(n=5)

Build the feature matrix X and the label vector y from the data frame, then split them into training and test sets.

In [6]:
from sklearn.model_selection import train_test_split

print("splitting data and train")

# Predictor columns, in the order they appear in the feature matrix.
feature_columns = [
    'Pregnancies',
    'PlasmaGlucose',
    'DiastolicBloodPressure',
    'TricepsThickness',
    'SerumInsulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
]

# Plain NumPy arrays: X holds the predictors, y the 'Diabetic' label.
X = df[feature_columns].values
y = df['Diabetic'].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

splitting data and train


In [None]:
import mlflow
# All runs started after this call are recorded under this experiment name.
experiment_name = "mlflow-experiment-diabetes"
mlflow.set_experiment(experiment_name=experiment_name)

In [11]:
from sklearn.linear_model import LogisticRegression

# Train one logistic-regression model inside an MLflow run, with scikit-learn
# autologging switched on so nothing has to be logged by hand.
with mlflow.start_run():
    mlflow.sklearn.autolog()

    # C is written as 1 / 0.1, i.e. the reciprocal of a 0.1 regularization rate.
    classifier = LogisticRegression(C=1 / 0.1, solver="liblinear")
    model = classifier.fit(X_train, y_train)



🏃 View run frank_collar_2h3hns87 at: https://switzerlandnorth.api.azureml.ms/mlflow/v2.0/subscriptions/7dae9e0f-de34-4921-91be-67945119f760/resourceGroups/rg-corp-data-ai-platform-001/providers/Microsoft.MachineLearningServices/workspaces/ml-workspace-corp-test-swn-001/#/experiments/80412d61-5674-481f-8d7e-1aea6a2610ff/runs/87d12a4f-279c-4584-aef8-be800ae1105e
🧪 View experiment at: https://switzerlandnorth.api.azureml.ms/mlflow/v2.0/subscriptions/7dae9e0f-de34-4921-91be-67945119f760/resourceGroups/rg-corp-data-ai-platform-001/providers/Microsoft.MachineLearningServices/workspaces/ml-workspace-corp-test-swn-001/#/experiments/80412d61-5674-481f-8d7e-1aea6a2610ff


### Custom Model Tracking part, without autolog

In [12]:
# Turn scikit-learn autologging off so the following cells can log their
# parameters and metrics explicitly.
mlflow.sklearn.autolog(disable=True)

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Manually tracked run: logistic regression with a regularization rate of 0.1.
with mlflow.start_run():
    # Name the rate once so the value used for C and the logged value cannot drift apart.
    reg_rate = 0.1
    model = LogisticRegression(C=1 / reg_rate, solver="liblinear").fit(X_train, y_train)

    # Accuracy is the fraction of test rows whose prediction equals the true label.
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)

    mlflow.log_param("regularization_rate", reg_rate)
    mlflow.log_metric("Accuracy", acc)

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# Manually tracked run: logistic regression with a regularization rate of 0.01.
with mlflow.start_run():
    # Name the rate once so the value used for C and the logged value cannot drift apart.
    reg_rate = 0.01
    model = LogisticRegression(C=1 / reg_rate, solver="liblinear").fit(X_train, y_train)

    # Accuracy is the fraction of test rows whose prediction equals the true label.
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)

    mlflow.log_param("regularization_rate", reg_rate)
    mlflow.log_metric("Accuracy", acc)
     

In [15]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Manually tracked run: decision tree classifier evaluated on the held-out set.
with mlflow.start_run():
    # Seed the tree so the logged accuracy is reproducible on a re-run; the
    # value matches the seed used for the train/test split.
    model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

    # Accuracy is the fraction of test rows whose prediction equals the true label.
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)

    mlflow.log_param("estimator", "DecisionTreeClassifier")
    mlflow.log_metric("Accuracy", acc)

🏃 View run jovial_horse_2pdm2j62 at: https://switzerlandnorth.api.azureml.ms/mlflow/v2.0/subscriptions/7dae9e0f-de34-4921-91be-67945119f760/resourceGroups/rg-corp-data-ai-platform-001/providers/Microsoft.MachineLearningServices/workspaces/ml-workspace-corp-test-swn-001/#/experiments/80412d61-5674-481f-8d7e-1aea6a2610ff/runs/af9304c0-c20c-46bb-82ba-8689724fa684
🧪 View experiment at: https://switzerlandnorth.api.azureml.ms/mlflow/v2.0/subscriptions/7dae9e0f-de34-4921-91be-67945119f760/resourceGroups/rg-corp-data-ai-platform-001/providers/Microsoft.MachineLearningServices/workspaces/ml-workspace-corp-test-swn-001/#/experiments/80412d61-5674-481f-8d7e-1aea6a2610ff


In [None]:
 pip install xgboost

In [None]:
# Custom part: uses the XGBClassifier algorithm to train a classification model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBClassifier

# Manually tracked run: XGBoost classifier, with accuracy as a metric and the
# ROC curve saved as an image artifact.
with mlflow.start_run():
    model = XGBClassifier(use_label_encoder=False, eval_metric="logloss").fit(X_train, y_train)

    # Accuracy is the fraction of test rows whose prediction equals the true label.
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)

    # ROC curve, computed from the second probability column (index 1).
    y_scores = model.predict_proba(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])
    fig = plt.figure(figsize=(6, 4))
    # Plot the diagonal 50% line
    plt.plot([0, 1], [0, 1], 'k--')
    # Plot the FPR and TPR achieved by our model
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.savefig("ROC-Curve.png")

    # Record the estimator that was actually trained in this run.
    mlflow.log_param("estimator", "XGBClassifier")
    mlflow.log_metric("Accuracy", acc)
    mlflow.log_artifact("ROC-Curve.png")

Custom training script that saves the predictions and shows them right away.

In [None]:
import pandas as pd
import numpy as np
import mlflow
import os
from sklearn.linear_model import LogisticRegression

# Trains a LogisticRegression model, builds a per-row predictions table for an
# evaluation set, saves it as CSV, logs it to the MLflow run and prints a preview.
# The model is fitted inside this cell, so no previously trained `model` is needed.

with mlflow.start_run():
    # NOTE: this switches scikit-learn autologging back on for the session
    # (it was disabled in the "without autolog" section earlier in the notebook).
    mlflow.sklearn.autolog()
    model = LogisticRegression(C=1/0.1, solver="liblinear").fit(X_train, y_train)

    # ---------- PREDICTIONS ----------
    # choose the dataset you want predictions for (X_test/y_test or X_train/y_train)
    X_eval = X_test      # change if you want predictions for train set
    y_true = y_test

    # Convert X_eval to DataFrame (keep feature names if available)
    if isinstance(X_eval, pd.DataFrame):
        df_X = X_eval.copy()
    else:
        # if X_train was a DataFrame, reuse its columns; otherwise create numeric names
        if isinstance(X_train, pd.DataFrame):
            cols = X_train.columns
        else:
            cols = [f"feature_{i}" for i in range(np.shape(X_eval)[1])]
        df_X = pd.DataFrame(X_eval, columns=cols)

    # Predictions and probabilities
    y_pred = model.predict(X_eval)
    # predict_proba might not exist for some classifiers - handle carefully
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_eval)
        if probs.shape[1] == 2:
            # binary: one column per class probability
            df_probs = pd.DataFrame({"prob_neg": probs[:, 0], "prob_pos": probs[:, 1]})
        else:
            # multiclass: create prob_class_<label> columns
            class_labels = model.classes_
            df_probs = pd.DataFrame(probs, columns=[f"prob_class_{c}" for c in class_labels])
    else:
        df_probs = pd.DataFrame()  # empty if not available

    # Build result table
    df_results = df_X.reset_index(drop=True).copy()
    df_results["y_true"] = np.array(y_true).reshape(-1)
    df_results["y_pred"] = np.array(y_pred).reshape(-1)
    if not df_probs.empty:
        df_results = pd.concat([df_results, df_probs.reset_index(drop=True)], axis=1)

    # Optional: add prediction score (decision_function) if available.
    # Still best-effort, but a failure is reported instead of silently ignored.
    if hasattr(model, "decision_function"):
        try:
            df_results["score"] = model.decision_function(X_eval)
        except Exception as exc:
            print(f"Skipping 'score' column: decision_function failed ({exc!r})")

    # ---------- SAVE & LOG ----------
    out_dir = "predictions"
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, "predictions_table.csv")
    df_results.to_csv(csv_path, index=False)

    # Prefer mlflow.log_table when this MLflow version provides it. The artifact
    # name needs a ".json" extension; without one the call is rejected.
    # If log_table is missing or fails, fall back to logging the CSV as an artifact.
    table_logged = False
    if hasattr(mlflow, "log_table"):
        try:
            mlflow.log_table(df_results, artifact_file="predictions_table.json")
            table_logged = True
        except Exception as exc:
            print(f"mlflow.log_table failed ({exc!r}); logging the CSV artifact instead")
    if not table_logged:
        mlflow.log_artifact(csv_path, artifact_path="predictions")

    # ---------- QUICK METRICS (optional) ----------
    from sklearn.metrics import classification_report, confusion_matrix
    report = classification_report(y_true, y_pred, output_dict=True)
    # Log a simple numeric metric example: accuracy
    accuracy = report.get("accuracy")
    if accuracy is not None:
        mlflow.log_metric("accuracy", float(accuracy))

    # show a preview (useful in notebooks)
    print(df_results.head())


In [None]:
import mlflow

# List the names of up to 10 experiments returned by the tracking server.
print("experiments")
experiments = mlflow.search_experiments(max_results=10)
for exp in experiments:
    print(exp.name)

print("search runs")
# Look up this notebook's experiment by the name set earlier (`experiment_name`)
# instead of reusing the loop variable, which would point at whichever
# experiment happened to be listed last and be undefined if none were returned.
current_experiment = mlflow.get_experiment_by_name(experiment_name)
if current_experiment is None:
    raise LookupError(f"MLflow experiment {experiment_name!r} was not found")
mlflow.search_runs(experiment_ids=[current_experiment.experiment_id])