# Retrieve the feature set from Azure ML managed feature store

In [None]:
%run feature_set_retrieval

In [None]:
# Convert the retrieved feature-set DataFrame to pandas so it can be used for training below.
# NOTE(review): `df` is not defined in this notebook — presumably it is created by the
# `%run feature_set_retrieval` cell above; confirm.
transformed_df = df.toPandas()
# Preview the first five rows as the cell output.
transformed_df.head(5)

# Track Machine Learning experiments and models

A machine learning model is a file that has been trained to recognize certain types of patterns. You train a model over a set of data, providing it an algorithm that it can use to reason over and learn from those data. Once you have trained the model, you can use it to reason over data that it hasn't seen before, and make predictions about that data.

In this notebook, you will learn the basic steps to run an experiment, track the run's metrics and parameters, and register the trained model as a new model version.


In [None]:
import mlflow

# Set given experiment as the active experiment. If an experiment with this name does not exist, a new experiment with this name is created.
# The name is kept in a variable because the Purview lineage cell later reuses it as the experiment asset name.
ml_experiment_name = "training-experiment"
mlflow.set_experiment(ml_experiment_name)


In [None]:
from pprint import pprint
from mlflow import MlflowClient

def yield_artifacts(run_id, path=None):
    """Yield the path of every file artifact logged under a run.

    Directories are walked depth-first, in the order the client lists them;
    only non-directory entries are yielded.

    Args:
        run_id: ID of the MLflow run whose artifacts are listed.
        path: Artifact directory to start from; ``None`` starts at the
            run's artifact root.

    Yields:
        The ``path`` attribute of each non-directory artifact.
    """
    # A single client serves the whole traversal instead of creating a new
    # one for every directory level.
    client = MlflowClient()

    def _walk(directory):
        # Recurse into directories, emit plain files.
        for entry in client.list_artifacts(run_id, directory):
            if entry.is_dir:
                yield from _walk(entry.path)
            else:
                yield entry.path

    yield from _walk(path)


def fetch_logged_data(run_id):
    """Collect what was logged for a run.

    Returns a dict with the run's params, metrics, tags (excluding any tag
    whose key starts with ``mlflow.``) and the list of artifact file paths.
    """
    run_data = MlflowClient().get_run(run_id).data

    # Keep only tags that are not in the "mlflow." namespace.
    user_tags = {}
    for tag_name, tag_value in run_data.tags.items():
        if tag_name.startswith("mlflow."):
            continue
        user_tags[tag_name] = tag_value

    artifact_paths = [artifact for artifact in yield_artifacts(run_id)]

    return dict(
        params=run_data.params,
        metrics=run_data.metrics,
        tags=user_tags,
        artifacts=artifact_paths,
    )

In [None]:
import numpy as np 
from mlflow.models import infer_signature
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Drop the timestamp column when it is present, then replace missing values with 0
features_without_timestamp = transformed_df.drop(columns=["pickup_timestamp"], errors="ignore")
final_df = features_without_timestamp.fillna(0)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
seed = 13
target_column = "scaled_demand"
X_train, X_test, y_train, y_test = train_test_split(
    final_df.drop(columns=[target_column]),
    final_df[target_column],
    test_size=0.2,
    random_state=seed,
)

# Hyperparameters handed to the SVR estimator in the training cell.
# NOTE(review): per the scikit-learn docs `degree` only applies to the 'poly' kernel,
# so it presumably has no effect with kernel='rbf' — confirm it is meant to be logged.
params = dict(
    C=10,
    kernel="rbf",
    degree=8,
    gamma="auto",
    tol=0.0001,
)

In [None]:
# The artifact path and the registered model name share one value so the logged
# model, the runs:/ URI and the registry entry cannot drift apart.
registered_model_name = "demand_prediction_model"

with mlflow.start_run() as run:
    # Fit the support vector regressor with the hyperparameters defined above
    clf = svm.SVR(**params).fit(X_train, y_train)

    # Predict on the held-out test split
    y_pred = clf.predict(X_test)

    # Calculate error metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Assemble the metrics we're going to write into a collection
    metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2}

    # NOTE(review): the output schema is inferred from the labels (y_test), not from
    # the model's predictions (y_pred). MLflow's own examples pass predictions; confirm
    # which is intended before changing it, because it alters the stored signature.
    signature = infer_signature(X_test, y_test)

    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the error metrics that were calculated on the test split
    mlflow.log_metrics(metrics)

    # Log the fitted model as a run artifact together with its signature
    mlflow.sklearn.log_model(clf, registered_model_name, signature=signature)
    print(f"Model saved in run_id={run.info.run_id}")

    # Register the model produced from your training job.
    # `mv` (the resulting model version) is read by the Purview cells further down.
    mv = mlflow.register_model(
        f"runs:/{run.info.run_id}/{registered_model_name}", registered_model_name
    )

In [None]:
# Look up the most recent run and report its ID
run_id = mlflow.last_active_run().info.run_id
print(f"Logged data and model in run: {run_id}")

# Print each category of logged data under its own banner
logged = fetch_logged_data(run_id)
for key in logged:
    print(f"\n---------- logged {key} ----------")
    pprint(logged[key])

## Register model training lineage to Purview

In [None]:
%run data_catalog_and_lineage

In [None]:
# NOTE(review): PurviewDataCatalog, DataAsset, DataLineage, featurestore, featureset_name,
# all_featuresets, fabric_tenant and the tenant/subscription/resource-group variables are not
# defined in this notebook — presumably they come from the `%run` cells above; confirm.
purview_data_catalog = PurviewDataCatalog()

# Create features assets
fset = featurestore.feature_sets.get(featureset_name, all_featuresets[featureset_name])
target_features = final_df.columns
feature_assets = purview_data_catalog.prepare_feature_assets(
    featurestore_name,
    fset,
    target_features,
    tenant_id=tenant_id,
    subscription_id=featurestore_subscription_id,
    resource_group=featurestore_resource_group_name,
)

# Prepare AML custom types
purview_data_catalog.prepare_aml_custom_types()

# Create model training notebook process asset
current_notebook_context = mssparkutils.notebook.nb.context
workspace_id = current_notebook_context["currentWorkspaceId"]
notebook_id = current_notebook_context["currentNotebookId"]
notebook_name = "model_training"
process_data_asset = DataAsset(
    f"{notebook_name} (Fabric notebook)",
    "process",
    f"https://{fabric_tenant}.powerbi.com/groups/{workspace_id}/synapsenotebooks/{notebook_id}",
)

# Create Azure ML experiment asset
ml_experiment_run = mlflow.get_run(mv.run_id).to_dictionary()
ml_artifact_uri = ml_experiment_run["info"]["artifact_uri"]
# The experiment ID is taken as the path segment that follows "<workspace_id>/" in the artifact URI.
ml_experiment_id = ml_artifact_uri.split(f"{workspace_id}/")[1].split("/")[0]
# Use the configured tenant host, matching the notebook URL above, instead of a hard-coded one.
ml_experiment_fqn = (
    f"https://{fabric_tenant}.powerbi.com/groups/{workspace_id}"
    f"/mlexperiments/{ml_experiment_id}?experience=data-science"
)

ml_experiment_asset = DataAsset(ml_experiment_name, "ml_experiment", ml_experiment_fqn)

# Register lineage like features -> model training notebook -> ML experiment
training_model_lineage = DataLineage(
    input_data_assets=feature_assets,
    output_data_assets=[ml_experiment_asset],
    process_asset=process_data_asset,
)
purview_data_catalog.register_lineage(training_model_lineage)


In [None]:
# Create Azure ML model asset
# The model ID is taken as the first path segment after the storage host in the model version's source URI.
ml_model_id = mv.source.split(".dfs.core.windows.net/")[1].split("/")[0]
ml_model_name = mv.name
ml_experiment_run = mlflow.get_run(mv.run_id).to_dictionary()
ml_experiment_run_name = ml_experiment_run["data"]["tags"]["mlflow.runName"]

# Build the model URL from the configured tenant host (the same `fabric_tenant` used for the
# notebook URL in the lineage cell) rather than a hard-coded host.
ml_model_fqn = (
    f"https://{fabric_tenant}.powerbi.com/groups/{workspace_id}"
    f"/mlmodels/{ml_model_id}?experience=data-science"
)
ml_model_asset = DataAsset(
    ml_model_name,
    "ml_model",
    ml_model_fqn,
    custom_properties={
        "version": mv.version,
        "experimentRunName": ml_experiment_run_name,
    },
    # Link the model back to the experiment asset by its qualified name.
    relationship_attributes=[{"type": "sources", "qualified_name": ml_experiment_fqn}],
)

# Register ML model entity connecting to ML experiment, without process node
purview_data_catalog.register_entity(ml_model_asset)
