# <span style="color:#1f77b4">**Machine Learning 02 - MLflow**</span>


### Unity Catalog storage setup


In [None]:
# Unity Catalog config for this project.
#
# Three text widgets let the catalog / schema / volume be overridden per run.
# An empty CATALOG widget means "auto-detect".
#
# NOTE: dbutils.widgets.removeAll() is deliberately not called here. Databricks
# documents that widgets cannot be re-created in the same cell that removes
# them, and removing them on every run would also discard any value the user
# typed into the widget.
dbutils.widgets.text("CATALOG", "")
dbutils.widgets.text("SCHEMA", "default")
dbutils.widgets.text("VOLUME", "ml_lab")

# Catalogs that must never be picked as the project catalog.
SYSTEM_CATALOGS = ("system",)

catalog_widget = dbutils.widgets.get("CATALOG").strip()
if catalog_widget:
    CATALOG = catalog_widget
else:
    # Prefer the current catalog; only list all catalogs when it is unusable.
    current = spark.sql("SELECT current_catalog()").first()[0]
    if current not in SYSTEM_CATALOGS:
        CATALOG = current
    else:
        catalogs = [r.catalog for r in spark.sql("SHOW CATALOGS").collect()]
        CATALOG = next((c for c in catalogs if c not in SYSTEM_CATALOGS), None)
        if CATALOG is None:
            # Fail with an actionable message instead of a bare StopIteration.
            raise ValueError(
                "No non-system catalog is available; set the CATALOG widget explicitly."
            )

SCHEMA = dbutils.widgets.get("SCHEMA").strip()
VOLUME = dbutils.widgets.get("VOLUME").strip()

# Root of the project volume, in the dbfs:-scheme form used by dbutils and Spark.
BASE = f"dbfs:/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}"


In [None]:
# Ensure schema and volume exist.
# The names come from free-text widgets, so each identifier is backtick-quoted
# (with embedded backticks doubled). This keeps names containing hyphens or
# other special characters from breaking the SQL statement.
def _quote_ident(name):
    """Return `name` as a backtick-quoted SQL identifier."""
    return "`" + name.replace("`", "``") + "`"


spark.sql(
    f"CREATE SCHEMA IF NOT EXISTS {_quote_ident(CATALOG)}.{_quote_ident(SCHEMA)}"
)
spark.sql(
    f"CREATE VOLUME IF NOT EXISTS "
    f"{_quote_ident(CATALOG)}.{_quote_ident(SCHEMA)}.{_quote_ident(VOLUME)}"
)


### <span style="color:#1f77b4">**Configuring MLflow temp storage**</span>


In [None]:
# Use UC volume for MLflow temp artifacts to avoid DBFS root
import os
import mlflow

# Local (POSIX) file APIs see a UC volume at /Volumes/<catalog>/<schema>/<volume>,
# i.e. BASE without its "dbfs:" scheme. No "/dbfs" prefix is added.
base_local = BASE[len("dbfs:"):] if BASE.startswith("dbfs:") else BASE
mlflow_tmp = base_local + "/mlflow_tmp"
dbutils.fs.mkdirs(BASE + "/mlflow_tmp")
os.environ["MLFLOW_TMP_DIR"] = mlflow_tmp

# Store MLflow artifacts in the UC volume
mlflow_artifacts = BASE + "/mlflow_artifacts"
experiment_name = "/Shared/machine-learning/diabetes-mlflow"

# Look the experiment up first so that re-running the cell does not rely on
# create_experiment raising; only create it when it does not exist yet.
exp = mlflow.get_experiment_by_name(experiment_name)
if exp is not None:
    exp_id = exp.experiment_id
else:
    try:
        exp_id = mlflow.create_experiment(experiment_name, artifact_location=mlflow_artifacts)
    except Exception as err:
        # Stay best-effort (the notebook keeps running), but surface the reason:
        # otherwise runs silently land in the default experiment.
        exp_id = None
        print(
            f"WARNING: could not create MLflow experiment {experiment_name!r} ({err}); "
            "runs will be logged to the default experiment."
        )

if exp_id:
    mlflow.set_experiment(experiment_name)


### <span style="color:#1f77b4">**Loading the CSV Dataset into the Unity Catalog Volume**</span>


In [None]:
# Refresh the raw data in the UC volume: wipe the target folder, recreate it,
# then copy the CSV into it.
# NOTE(review): dbutils.fs.cp is given an https:// source here - confirm the
# cluster's filesystem layer supports copying from that scheme.
DIABETES_CSV_URL = "https://raw.githubusercontent.com/Ch3rry-Pi3-Azure/DataBricks-Machine-Learning/refs/heads/main/data/diabetes.csv"

data_dir = f"{BASE}/diabetes"
dbutils.fs.rm(data_dir, recurse=True)
dbutils.fs.mkdirs(data_dir)
dbutils.fs.cp(DIABETES_CSV_URL, f"{data_dir}/diabetes.csv")


### <span style="color:#1f77b4">**Splitting Dataset into Training and Testing sets**</span>


In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
   
# Load the CSV (every column arrives as a string because no schema is given),
# cast each column to its numeric type, then drop incomplete rows.
raw_data = spark.read.format("csv").option("header", "true").load(BASE + "/diabetes/diabetes.csv")

# Target type per column, in the order the columns should appear.
column_types = {
    "Pregnancies": "int",
    "Glucose": "int",
    "BloodPressure": "int",
    "SkinThickness": "int",
    "Insulin": "int",
    "BMI": "float",
    "DiabetesPedigreeFunction": "float",
    "Age": "int",
    "Outcome": "int",
}

# dropna() runs AFTER the casts: a value that cannot be cast becomes null, and
# dropping nulls only before the cast would let those rows through.
data = raw_data.select(
    [col(name).astype(dtype) for name, dtype in column_types.items()]
).dropna()

# Fixed seed so that the 70/30 split is repeatable across runs.
SPLIT_SEED = 42
splits = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train = splits[0]
test = splits[1]
print("Training Rows:", train.count(), " Testing Rows:", test.count())


### <span style="color:#1f77b4">**Creating an MLflow Experiment Function**</span>


In [None]:
def train_diabetes_model(training_data, test_data, maxIterations, regularization):
    """Train, evaluate and log a logistic-regression classifier as one MLflow run.

    The pipeline assembles the eight numeric feature columns into a vector,
    min-max scales it, and fits a LogisticRegression on the "Outcome" label.
    Hyperparameters, three evaluation metrics and the fitted pipeline model are
    logged to the active MLflow experiment; the model is also saved under
    ``BASE + "/models/<name>"`` (``BASE`` is read from the notebook globals).

    Args:
        training_data: DataFrame passed to ``Pipeline.fit``; must contain the
            feature columns listed in ``numFeatures`` and an "Outcome" column.
        test_data: DataFrame with the same columns, used for evaluation.
        maxIterations: Value for LogisticRegression ``maxIter``.
        regularization: Value for LogisticRegression ``regParam``.

    Returns:
        None. Results are printed and recorded in MLflow.
    """
    import time

    import mlflow
    import mlflow.spark
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import VectorAssembler, MinMaxScaler
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    # Start an MLflow run
    with mlflow.start_run():
        numFeatures = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]

        # Define feature engineering and model steps
        numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
        numScaler = MinMaxScaler(inputCol=numVector.getOutputCol(), outputCol="normalizedFeatures")
        featureVector = VectorAssembler(inputCols=["normalizedFeatures"], outputCol="features")
        algo = LogisticRegression(labelCol="Outcome", featuresCol="features", maxIter=maxIterations, regParam=regularization)

        # Chain the steps as stages in a Pipeline. The instance gets its own
        # lowercase name so that it does not shadow the imported Pipeline class.
        pipeline = Pipeline(stages=[numVector, numScaler, featureVector, algo])

        # Log training parameter values
        print("Training Logistic Regression model...")
        mlflow.log_param('maxIter', algo.getMaxIter())
        mlflow.log_param('regParam', algo.getRegParam())
        model = pipeline.fit(training_data)

        # Evaluate the model and log metrics
        prediction = model.transform(test_data)
        metrics = ["accuracy", "weightedRecall", "weightedPrecision"]
        for metric in metrics:
            evaluator = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName=metric)
            metricValue = evaluator.evaluate(prediction)
            print("%s: %s" % (metric, metricValue))
            mlflow.log_metric(metric, metricValue)

        # Log the model itself.
        # An integer millisecond timestamp keeps the name unique per call and
        # free of "." (NOTE(review): newer MLflow releases reportedly reject
        # "." in logged-model names - confirm against the installed version).
        unique_model_name = "classifier-" + str(int(time.time() * 1000))
        mlflow.spark.log_model(
            model,
            unique_model_name,
            conda_env=mlflow.spark.get_default_conda_env(),
        )
        modelpath = BASE + "/models/%s" % (unique_model_name)
        mlflow.spark.save_model(model, modelpath)

        print("Experiment run complete.")


### <span style="color:#1f77b4">**Calling our MLflow experiment function with different hyperparameters**</span>


In [None]:
# First experiment run: maxIterations=5, regularization=0.5
train_diabetes_model(train, test, 5, 0.5)


In [None]:
# Second experiment run for comparison: maxIterations=10, regularization=0.2
train_diabetes_model(train, test, 10, 0.2)


### <span style="color:#1f77b4">**Testing our registered model via a browser-based endpoint**</span>


In [None]:
# Example request body in "dataframe_records" form: one record whose keys are
# the eight feature columns the model was trained on (no "Outcome" label).
# Presumably meant to be pasted into the serving endpoint's query UI - when
# copying it as JSON, leave these comment lines out.
{
   "dataframe_records": [
   {
      "Pregnancies": 8,
      "Glucose": 85,
      "BloodPressure": 65,
      "SkinThickness": 29,
      "Insulin": 0,
      "BMI": 26.6,
      "DiabetesPedigreeFunction": 0.672,
      "Age": 34
   }
   ]
 }
