In [None]:
# Import all necessary modules at the top
import os
import sys
from pprint import pprint

# Add the parent directory to the path to import utils.
# `__file__` is not defined inside a notebook kernel, so fall back to the
# current working directory in that case.
# NOTE(review): the fallback assumes the kernel's working directory is the
# notebook's own directory — confirm for the target environment.
try:
    PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
except NameError:
    PROJECT_ROOT = os.path.dirname(os.getcwd())

# Skip the append when the entry is already present so re-running the cell
# does not keep growing sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import mlflow
import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMRegressor
from mlflow.models import infer_signature
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Fabric-specific imports
try:
    from notebookutils import mssparkutils
except ImportError:
    # Fallback for environments where notebookutils is not available
    mssparkutils = None

# Import utility classes from our utils package
from utils import DataAsset, DataLineage, MockFeatureStore, PurviewDataCatalog, fetch_logged_data

# Retrieve the feature set from Azure ML managed feature store

In [None]:
%run feature_set_retrieval

# Load the NYC taxi dataset from previous feature retrieval
# This should be loaded from feature set retrieval results
df = spark.sql("SELECT * FROM nyctaxi_featureset LIMIT 1000")  # placeholder
nyctaxi_df = df.toPandas()
nyctaxi_df.head()

In [None]:
# Load the NYC weather dataset from previous feature retrieval
# This should be loaded from feature set retrieval results
nycweather_df_spark = spark.sql("SELECT * FROM nycweather_featureset LIMIT 1000")  # placeholder
nycweather_df = nycweather_df_spark.toPandas()  # type: ignore

# Keep only the rows whose "year" is 2022; the column is constant after the
# filter, so it is dropped.
nycweather_df = nycweather_df[nycweather_df["year"] == 2022].drop(columns=["year"])

# nyctaxi_df is already built from `df` in the taxi loading cell, so it is not
# collected a second time here; the cell ends on the weather preview.
nycweather_df.head()

In [None]:
# Requires nycweather_df from the weather loading cell.

# Min-max scale every column of the weather frame and wrap the resulting
# array in a new DataFrame that reuses the original column names.
nycweather_scaler = MinMaxScaler()
scaled_weather_array = nycweather_scaler.fit_transform(nycweather_df)
nycweather_scaled_df = pd.DataFrame(
    scaled_weather_array,
    columns=nycweather_df.columns,
)
nycweather_scaled_df.head()

In [None]:
# Scaling NYC weather columns with minmax scaler.
# The raw measurement columns and the names of their scaled counterparts are
# listed once and reused for fitting, transforming and assignment.
weather_raw_columns = ["temperature_2m_c", "windspeed_10m_km_per_hour", "precipitation_mm", "cloudcover_percentage"]
weather_scaled_columns = ["scaled_temperature", "scaled_windspeed", "scaled_precipitation", "scaled_cloudcover"]

# Fit on the four raw columns only, then transform the same columns.
nycweather_scaler = MinMaxScaler().fit(nycweather_df[weather_raw_columns])
scaled_weather_values = nycweather_scaler.transform(nycweather_df[weather_raw_columns])

# The scaled values are added to nycweather_df as new columns.
nycweather_df[weather_scaled_columns] = scaled_weather_values

nycweather_df.head()

In [None]:
# Join NYC taxi and weather data.
# The weather frame is indexed by (month, day, hour) so the taxi pickup
# columns can be matched against that index; the inner join keeps only trips
# that have a matching weather row.
weather_by_time = nycweather_df.set_index(["month", "day", "hour"])
nyctaxi_with_weather_df = nyctaxi_df.join(
    weather_by_time,
    on=["month_pickup", "day_pickup", "hour_pickup"],
    how="inner",
)
nyctaxi_with_weather_df.head()

# Track Machine Learning experiments and models

A machine learning model is a file that has been trained to recognize certain types of patterns. You train a model over a set of data, providing it an algorithm that it can use to reason over and learn from those data. Once you have trained the model, you can use it to reason over data that it hasn't seen before, and make predictions about that data.

In this notebook, you will learn the basic steps to run an experiment, log the run's metrics and parameters, and register the trained model as a new model version.


In [None]:
# Make the named experiment the active one for the runs started below.
# MLflow creates the experiment when no experiment with this name exists yet.
ml_experiment_name = "training-experiment"
mlflow.set_experiment(experiment_name=ml_experiment_name)

In [None]:
# Build the training frame from the taxi + weather data joined on
# (month, day, hour) in the join cell.
# A column-wise pd.concat of nyctaxi_df and nycweather_scaled_df must not be
# used here: it aligns rows by index position rather than by time keys, which
# pairs trips with unrelated weather rows and yields NaN rows when the two
# frames differ in length.
final_df = nyctaxi_with_weather_df.drop(columns=["pickup_datetime", "dropoff_datetime"])  # Remove datetime columns

# Split features and target
X = final_df.drop(columns=["demand"])  # Features
y = final_df["demand"]  # Target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameters passed to LGBMRegressor and logged to MLflow with the run.
params = {
    "boosting_type": "gbdt",
    "importance_type": "split",
    "learning_rate": 0.1,
    "max_depth": 3,
    "n_estimators": 100,
    "n_jobs": -1,
    "num_leaves": 31,
    "objective": "poisson",
}

In [None]:
# Feature set names
nyctaxi_featureset_name = "nyctaxi"
nycweather_featureset_name = "nycweather"

# Infrastructure settings are read from the Spark session configuration;
# each key falls back to an empty string when it is not set.
tenant_id = spark.conf.get("spark.fsd.tenant_id", "")
featurestore_subscription_id = spark.conf.get("spark.fsd.subscription_id", "")
featurestore_resource_group_name = spark.conf.get("spark.fsd.rg_name", "")
featurestore_name = spark.conf.get("spark.fsd.name", "")
fabric_tenant = spark.conf.get("spark.fsd.fabric.tenant", "")

# Feature store client (mock implementation from the utils package)
featurestore = MockFeatureStore()

# Mock all_featuresets dict: every feature set name maps to "1"
all_featuresets = dict.fromkeys(
    (nyctaxi_featureset_name, nycweather_featureset_name),
    "1",
)

# Data catalog client used for the Purview lineage registration
purview_data_catalog = PurviewDataCatalog()

with mlflow.start_run() as run:
    # Fit the LightGBM regressor with the hyper-parameters defined in `params`.
    model = LGBMRegressor(**params).fit(X_train, y_train)

    # Predict on the validation set
    y_pred = model.predict(X_test)

    # Calculate error metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Assemble the metrics we're going to write into a collection
    metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2}

    # Infer the signature from the model's actual output rather than from the
    # labels, so the recorded output schema matches what `predict` returns.
    signature = infer_signature(X_test, y_pred)

    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Activate the MLFlow logging API to log your model artifacts
    mlflow.sklearn.log_model(model, "demand_prediction_model", signature=signature)
    print(f"Model saved in run_id={run.info.run_id} with metrics {metrics}")

    # Register the model produced from your training job.
    # `mv` (the registered model version) is reused by the lineage cells.
    mv = mlflow.register_model(f"runs:/{run.info.run_id}/demand_prediction_model", "demand_prediction_model")

In [None]:
# Look up the most recent run and report its id.
last_run = mlflow.last_active_run()
run_id = last_run.info.run_id
print(f"Logged data and model in run: {run_id}")

# Print each section returned by fetch_logged_data under its own banner.
logged_sections = fetch_logged_data(run_id)
for section_name, section_payload in logged_sections.items():
    print(f"\n---------- logged {section_name} ----------")
    pprint(section_payload)

## Register model training lineage to Purview

In [None]:
purview_data_catalog = PurviewDataCatalog()

# Fetch both feature sets from the feature store. The second argument is the
# value stored in all_featuresets for that name (presumably the feature set
# version — confirm against the feature store API).
nyctaxi_fset = featurestore.feature_sets.get(
    nyctaxi_featureset_name,
    all_featuresets[nyctaxi_featureset_name],
)
nycweather_fset = featurestore.feature_sets.get(
    nycweather_featureset_name,
    all_featuresets[nycweather_featureset_name],
)

# Every column of the joined frame except the target counts as a used feature.
used_features = nyctaxi_with_weather_df.drop(columns=["demand"]).columns

# Keyword arguments shared by both prepare_feature_assets calls.
infra_info = dict(
    tenant_id=tenant_id,
    subscription_id=featurestore_subscription_id,
    resource_group=featurestore_resource_group_name,
)

# Create features assets, one collection per feature set.
nyctaxi_feature_assets = purview_data_catalog.prepare_feature_assets(
    featurestore_name,
    nyctaxi_fset,
    used_features,
    **infra_info,
)
nycweather_feature_assets = purview_data_catalog.prepare_feature_assets(
    featurestore_name,
    nycweather_fset,
    used_features,
    **infra_info,
)

# Prepare AML custom types
purview_data_catalog.prepare_aml_custom_types()

# Create model training notebook process asset.
# The import fallback sets mssparkutils to None when notebookutils is not
# available; fail with an explicit message instead of an AttributeError on None.
if mssparkutils is None:
    raise RuntimeError(
        "mssparkutils is not available: the Fabric notebook context is required "
        "to resolve the workspace and notebook ids for lineage registration."
    )

# Workspace and notebook ids come from the running notebook's context.
current_notebook_context = mssparkutils.notebook.nb.context
workspace_id = current_notebook_context["currentWorkspaceId"]
notebook_id = current_notebook_context["currentNotebookId"]
notebook_name = "model_training"

# The process asset is identified by the notebook's URL in the Fabric tenant.
process_data_asset = DataAsset(
    f"{notebook_name} (Fabric notebook)",
    "process",
    f"https://{fabric_tenant}.powerbi.com/groups/{workspace_id}/synapsenotebooks/{notebook_id}",
)

# Create Azure ML experiment asset.
# Fetch the run that produced the registered model version and read its
# artifact URI.
ml_experiment_run = mlflow.get_run(mv.run_id).to_dictionary()
ml_artifact_uri = ml_experiment_run["info"]["artifact_uri"]

# The experiment id is taken as the path segment that follows
# "<workspace_id>/" in the artifact URI.
ml_experiment_id = ml_artifact_uri.split(f"{workspace_id}/")[1].split("/")[0]

# Build the URL from the configured Fabric tenant, the same way the notebook
# process asset URL is built, instead of hard-coding a tenant host.
ml_experiment_fqn = (
    f"https://{fabric_tenant}.powerbi.com/groups/{workspace_id}/mlexperiments/{ml_experiment_id}?experience=data-science"
)

ml_experiment_asset = DataAsset(ml_experiment_name, "ml_experiment", ml_experiment_fqn)

# Register lineage: features -> model training notebook -> ML experiment.
# Inputs are the feature assets of both feature sets; the single output is
# the ML experiment asset.
lineage_inputs = nyctaxi_feature_assets + nycweather_feature_assets
lineage_outputs = [ml_experiment_asset]

training_model_lineage = DataLineage(
    input_data_assets=lineage_inputs,
    output_data_assets=lineage_outputs,
    process_asset=process_data_asset,
)
purview_data_catalog.register_lineage(training_model_lineage)

In [None]:
# Create Azure ML model asset.
# The model id is taken as the first path segment after the storage host
# (".dfs.core.windows.net/") in the registered model version's source URI.
ml_model_id = mv.source.split(".dfs.core.windows.net/")[1].split("/")[0]
ml_model_name = mv.name

# The run name is read from the tags of the run that produced the model version.
ml_experiment_run = mlflow.get_run(mv.run_id).to_dictionary()
ml_experiment_run_name = ml_experiment_run["data"]["tags"]["mlflow.runName"]

# Build the URL from the configured Fabric tenant, the same way the notebook
# process asset URL is built, instead of hard-coding a tenant host.
ml_model_fqn = f"https://{fabric_tenant}.powerbi.com/groups/{workspace_id}/mlmodels/{ml_model_id}?experience=data-science"
ml_model_asset = DataAsset(
    ml_model_name,
    "ml_model",
    ml_model_fqn,
    custom_properties={"version": mv.version, "experimentRunName": ml_experiment_run_name},
    # Links the model entity to the experiment through the experiment's qualified name.
    relationship_attributes=[{"type": "sources", "qualified_name": ml_experiment_fqn}],
)

# Register ML model entity connecting to ML experiment, without process node
purview_data_catalog.register_entity(ml_model_asset)