In [0]:
%pip install databricks-feature-engineering matplotlib shap
# Restart the Python process so the packages installed above can be imported by the cells below.
dbutils.library.restartPython()

In [0]:
# One-time setup, kept commented out for reference.
# It downloads the diabetes CSV, derives a binary `target` (glyhb >= 6.5) and a
# numeric `gender` column, and overwrites the Delta table
# `workspace.ml_training.diabetes_dataset` that the training cells read.
# Uncomment and run once if that table does not exist yet.
#
# import pandas as pd
# import pyspark.sql.functions as f
# from pyspark.ml.feature import StringIndexer

# url = "https://raw.githubusercontent.com/Ezapataq07/databricks-machine-learning-at-scale/refs/heads/main/machine-learning-at-scale-1.2.2/FeatureStore/diabetes.csv"

# pdf = pd.read_csv(url)
# pdf.columns = [col.replace('.','') for col in pdf.columns]
# df = spark.createDataFrame(pdf)

# df = df.withColumn('target', f.when(
#                         f.col('glyhb') >= 6.5, 1
#                         ). otherwise(
#                             0
#                         ))

# df = df.withColumn('gender', f.when(
#                                 f.col('gender') == 'female', 1
#                             ). otherwise(
#                                 0
# ))


# df.write.format('delta').mode('overwrite').saveAsTable('workspace.ml_training.diabetes_dataset')
# display(df)

In [0]:
# One-time setup, kept commented out for reference.
# It registers the feature table `workspace.ml_training.diabetes_features`, keyed by `id`,
# from `df` with the `target`, `location` and `frame` columns removed.
# It needs the `df` built by the commented-out dataset-creation code.
#
# # Create the feature table
# from databricks.feature_engineering import FeatureEngineeringClient
# fe = FeatureEngineeringClient()

# fe.create_table(
#     name = 'workspace.ml_training.diabetes_features',
#     primary_keys = ['id'],
#     df = df.drop('target','location','frame'),
#     description = 'Diabetes features'
# )

In [0]:
import mlflow

# Labelled source table; only `id` and `target` are taken from it below.
df = spark.sql("SELECT * FROM workspace.ml_training.diabetes_dataset")

# Load the feature table as an MLflow dataset so it can be logged as a run input later.
feature_dataset = mlflow.data.load_delta(
    table_name='workspace.ml_training.diabetes_features',
    name='diabetes_binary',
)

# Attach the label to each feature row by `id`, then collect the result to pandas.
labels_sdf = df.select('id', 'target')
labelled_features_sdf = feature_dataset.df.join(labels_sdf, on='id', how='left')
feature_data_pd = labelled_features_sdf.toPandas()

# `id` is only a join key, so it is removed before modelling.
feature_data_pd = feature_data_pd.drop(columns=['id'])
feature_data_pd.head()

In [0]:
# Cast every column (features and target alike) to float64 in a single pass.
feature_data_pd = feature_data_pd.astype('double')

print(feature_data_pd.dtypes)

In [0]:
# Hold out a small test set for evaluation
from sklearn.model_selection import train_test_split

target_col = 'target'

# Separate the label from the feature matrix.
y = feature_data_pd[target_col]
X = feature_data_pd.drop(columns=[target_col])

# NOTE(review): if the feature table still contains `glyhb` (the column the
# commented-out setup code derives `target` from), the model sees the label
# directly. Confirm, and drop that column from X if so.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)
                                                                         

In [0]:
# Fit and log the model
from math import sqrt

import mlflow
import mlflow.data
import mlflow.sklearn
from mlflow.models.signature import infer_signature

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dtc_params = {
    'criterion': 'gini',
    'max_depth': 50,
    'min_samples_split': 20,
    'min_samples_leaf': 5
}

# Register models in Unity Catalog
mlflow.set_registry_uri('databricks-uc')

# Set the path for the MLflow experiment
mlflow.set_experiment("/Workspace/Users/emanuel.zapata@datalytics.com/ModelTrackingMLFlow")

# Turn off autologging so that only what is logged explicitly below ends up in the run
mlflow.sklearn.autolog(disable=True)
model_name = "workspace.ml_training.diabetes_model"


def _log_classification_metrics(prefix, y_true, y_hat):
    """Log accuracy, precision, recall and F1 to the active MLflow run.

    Each metric is logged under the key ``<prefix>_<metric>``, for example
    ``train_accuracy`` or ``test_f1``.

    Args:
        prefix: Name of the data split the metrics belong to.
        y_true: Ground-truth labels.
        y_hat: Labels predicted by the model for the same rows.
    """
    mlflow.log_metric(f"{prefix}_accuracy", accuracy_score(y_true, y_hat))
    mlflow.log_metric(f"{prefix}_precision", precision_score(y_true, y_hat))
    mlflow.log_metric(f"{prefix}_recall", recall_score(y_true, y_hat))
    mlflow.log_metric(f"{prefix}_f1", f1_score(y_true, y_hat))


# Start an MLflow run
with mlflow.start_run(run_name="Model Tracking Demo") as run:
    # Log the datasets used by this run
    mlflow.log_input(feature_dataset, context="source")
    mlflow.log_input(mlflow.data.from_pandas(X_train, source=feature_dataset.source), context="training")
    mlflow.log_input(mlflow.data.from_pandas(X_test, source=feature_dataset.source), context="test")

    # Log our parameters
    mlflow.log_params(dtc_params)

    # Fit our model
    dtc = DecisionTreeClassifier(**dtc_params)
    dtc_mdl = dtc.fit(X_train, y_train)

    # Define the model signature
    signature = infer_signature(X, y)

    # Log the model and register it under `model_name`
    model_info = mlflow.sklearn.log_model(
        sk_model=dtc_mdl,
        artifact_path="model-artifacts",
        signature=signature,
        registered_model_name=model_name)

    # Evaluate on the training set
    _log_classification_metrics("train", y_train, dtc_mdl.predict(X_train))

    # Evaluate on the test set; `y_pred` is left holding the test-set
    # predictions because the confusion-matrix cell reads it.
    y_pred = dtc_mdl.predict(X_test)
    _log_classification_metrics("test", y_test, y_pred)

    # MODEL EVALUATION (From Analytics Masters Degree)

    # Build the evaluation frame as a new object. Assigning the target column
    # onto `X_test` itself would mutate it, and re-running this cell would then
    # pass an unexpected `target` column to `predict`.
    eval_data = X_test.assign(**{target_col: y_test})

    mlflow.evaluate(
        model_info.model_uri,
        eval_data,
        targets=target_col,
        model_type="classifier",
    )

In [0]:
# `run` is the object bound by `with mlflow.start_run(...) as run`; its `info` attribute holds the run's details.
run.info

# Log Model Artifacts
In addition to logging parameters, metrics, and the model itself, we can also log artifacts — any files or data relevant to the run. Let's set up an MLflow client to log artifacts after the run is completed.

In [0]:
from mlflow.client import MlflowClient

# Client used by the cells below to attach figures and files to the run via its run_id.
client = MlflowClient()

### Log Confusion Matrix

The confusion matrix is a useful tool to visualize the classification performance of the model. It provides insights into the true positive, true negative, false positive, and false negative predictions.

Let's create the confusion matrix and log it with MLflow using the `log_figure` function.

In [0]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Row/column order of the matrix: class 1 first, then class 0.
class_order = [1, 0]

# `y_pred` holds the test-set predictions produced in the training cell.
cm = confusion_matrix(y_test, y_pred, labels=class_order)

# Draw the matrix on an explicit figure so the same object can be logged to MLflow.
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_order)
disp.plot(cmap=plt.cm.Blues, ax=ax)
ax.set_title('Confusion Matrix')

# Attach the figure to the finished run as an artifact.
client.log_figure(
    run.info.run_id,
    figure=fig,
    artifact_file="confusion_matrix.png",
)

# Render it inline as well.
plt.show()

### Log Feature Importance

Now, let's examine and log the resulting model. We'll extract and plot the feature importances inferred from the Decision Tree model to understand which data features are most
critical for successful prediction.

As with the previous figure, we will use the `log_figure` function.

In [0]:
import numpy as np

# Column names and the importance the fitted tree assigned to each of them.
feature_names = X_train.columns.to_list()
feature_importances = dtc_mdl.feature_importances_

# One bar per feature, labelled with the feature name.
tick_positions = np.arange(len(feature_names))
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(tick_positions, feature_importances, align='center', alpha=0.7)
ax.set_xticks(tick_positions)
ax.set_xticklabels(feature_names, rotation=45)
ax.set_ylabel('Importance')
ax.set_title('Feature Importances in Decision Tree Classifier')

# Attach the figure to the run as an artifact.
client.log_figure(
    run.info.run_id,
    figure=fig,
    artifact_file="feature_importances.png",
)

# Render it inline as well.
plt.show()

### Log Tree Structure

Decision trees make splitting decisions on different features at different critical values, and visualizing the tree structure helps us understand the decision logic. We'll plot the
branching tree structure for better interpretation.

We can get the tree in text format or as a graph. To log the text format, we will use the `log_artifact` function.

In [0]:
# Report the size of the fitted tree before rendering its structure.
fitted_tree = dtc_mdl.tree_
print(f"The fitted DecisionTreeClassifier model has {fitted_tree.node_count} nodes and is up to {fitted_tree.max_depth} levels deep.")

In [0]:
import tempfile
from pathlib import Path

from sklearn.tree import export_text

# Plain-text rendering of the fitted tree's split rules.
text_representation = export_text(dtc_mdl, feature_names=feature_names)
print(text_representation)

# The artifact is stored in the run under this file name.
tree_struct_filename = "tree_structure.txt"

# Write the text into a temporary directory rather than the notebook's working
# directory, so no stray file is left behind, and pin the encoding so the
# artifact does not depend on the platform default.
with tempfile.TemporaryDirectory() as scratch_dir:
    tree_struct_path = Path(scratch_dir) / tree_struct_filename
    tree_struct_path.write_text(text_representation, encoding="utf-8")

    # Log it to MLflow while the temporary file still exists.
    client.log_artifact(run.info.run_id, str(tree_struct_path))

In [0]:
from sklearn.tree import plot_tree

# Draw only the top levels of the tree so the figure stays readable.
fig, ax = plt.subplots(figsize=(20, 20))
plot_tree(
    dtc_mdl,
    feature_names=feature_names,
    max_depth=2,
    class_names=['0', '1'],
    filled=True,
    ax=ax,
)
ax.set_title('Decision Tree Structure')

# Attach the figure to the run as an artifact.
client.log_figure(run.info.run_id, fig, "decision_tree_structure.png")

# Render it inline as well.
plt.show()