# Azure Machine Learning Pipeline with AutoMLStep
This notebook demonstrates the use of AutoMLStep in Azure Machine Learning Pipeline.

## Introduction
1. Create an `Experiment` in an existing `Workspace`.
2. Create or Attach existing AmlCompute to a workspace.
3. Define data loading in a `TabularDataset`.
4. Configure AutoML using `AutoMLConfig`.
5. Use AutoMLStep
6. Train the model using AmlCompute
7. Explore the results.
8. Test the best fitted model.

## Azure Machine Learning and Pipeline SDK-specific imports

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Report the installed core SDK version so the run can be tied to a release.
print("Azure ML Python SDK version:", azureml.core.VERSION)

Azure ML Ptython SDK version: 1.59.0


## Initialize Workspace
Initialize a workspace object from the persisted configuration. Make sure the config file is present at `.\config.json`.

In [None]:
# Load the workspace described by the persisted config file, then echo its
# identifying fields, one per line.
ws = Workspace.from_config()
print("\n".join(str(field) for field in (ws.name,
                                         ws.resource_group,
                                         ws.location,
                                         ws.subscription_id)))

## Create an Azure ML experiment


In [None]:
# Name of the run-history container (experiment) inside the workspace.
experiment_name = 'automljob_mqtt_prediction'

# Local folder for script and data files. It is handed to AutoMLConfig; a
# pipeline does not strictly need it because the pipeline creates its own
# folder for the pipeline job.
project_folder = './project'

experiment = Experiment(workspace=ws, name=experiment_name)
experiment

## Create or Attach an AmlCompute cluster


In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse or, if it is missing, to create.
amlcompute_cluster_name = "Marposs"

try:
    # Reuse the cluster when the workspace already has one under this name;
    # the lookup raises ComputeTargetException otherwise.
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
    print("Compute target:", amlcompute_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',  # for GPU, use "Standard_NC6s_v3"
        # vm_priority='lowpriority',  # optional
        max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
    # Provisioning is asynchronous: wait for it to finish so the status shown
    # below (and the later pipeline submission) sees a ready cluster.
    compute_target.wait_for_completion(show_output=True)

# For a more detailed view of current AmlCompute status, use get_status().
compute_target.get_status()

## Data Ingestion

In [None]:
# Reuse the dataset registered in the workspace under `key` when it exists;
# otherwise build a TabularDataset from the CSV file and register it.
key = "Marposstestdevice3"
description_text = "Marposstestdevice dataset"
found = key in ws.datasets.keys()

if found:
    dataset = ws.datasets[key]
else:
    # Source CSV used to create the dataset on first run.
    new_data = 'https://iotappstorage1.blob.core.windows.net/telemetry-azaz-phd/new_data.csv'
    dataset = Dataset.Tabular.from_delimited_files(new_data)

    # Register the dataset so later runs take the branch above.
    dataset = dataset.register(workspace=ws,
                               name=key,
                               description=description_text)

# Materialize locally and preview the first rows.
df = dataset.to_pandas_dataframe()
df.head(5)

## AutoML configurations to Start AutoML job
This creates a general AutoML settings object.

In [None]:
# Run-level limits and the metric AutoML optimizes, kept in one dict so they
# can be tuned without touching the config call below.
automl_settings = dict(
    experiment_timeout_minutes=20,
    max_concurrent_iterations=4,
    primary_metric='r2_score',
)

# Regression on the registered dataset, predicting the "K2001" column on the
# AmlCompute cluster.
automl_config = AutoMLConfig(
    task="regression",
    compute_target=compute_target,
    training_data=dataset,
    label_column_name="K2001",
    path=project_folder,
    featurization='auto',
    enable_early_stopping=True,
    debug_log="automl_errors.log",
    **automl_settings
)


## Create an AutoMLStep.

In [7]:
from azureml.pipeline.core import PipelineData, TrainingOutput

ds = ws.get_default_datastore()
metrics_output_name = 'metrics_output'
best_model_output_name = 'best_model_output'


def _automl_output(name, pipeline_output_name, output_type):
    """Declare a PipelineData on the default datastore tied to an AutoML training output.

    name: name of the PipelineData object.
    pipeline_output_name: name under which the pipeline run exposes this output.
    output_type: TrainingOutput type string ('Metrics' or 'Model' in this notebook).
    """
    return PipelineData(name=name,
                        datastore=ds,
                        pipeline_output_name=pipeline_output_name,
                        training_output=TrainingOutput(type=output_type))


metrics_data = _automl_output('metrics_data', metrics_output_name, 'Metrics')
model_data = _automl_output('model_data', best_model_output_name, 'Model')

In [8]:
# Wrap the AutoML configuration in a pipeline step that publishes the metrics
# and best-model outputs; allow_reuse=True is passed so the step may be reused.
automl_step = AutoMLStep(name='automljob_mqtt_prediction',
                         automl_config=automl_config,
                         outputs=[metrics_data, model_data],
                         allow_reuse=True)

In [9]:
from azureml.pipeline.core import Pipeline

# Single-step pipeline containing only the AutoML step.
pipeline = Pipeline(workspace=ws,
                    steps=[automl_step],
                    description="automl_pipleline_mqtt_prediction")

## Running a Pipeline

In [10]:
# Submit the pipeline under the experiment and block until the run ends; the
# value returned by wait_for_completion() is shown as the cell output.
pipeline_run = experiment.submit(pipeline)
pipeline_run.wait_for_completion()

Created step automljob_mqtt_prediction [98a5111e][33876154-8463-4528-9915-97d85cb5f206], (This step will run and generate new outputs)
Submitted PipelineRun cffff284-594b-465c-a0f1-68f8165b31fa
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/cffff284-594b-465c-a0f1-68f8165b31fa?wsid=/subscriptions/fc011c7b-8150-4065-af8b-1a8487bc3f73/resourcegroups/Muhammad-Azaz-Farooq/workspaces/trainmodels&tid=e99647dc-1b08-454a-bf8c-699181b389ab
PipelineRunId: cffff284-594b-465c-a0f1-68f8165b31fa
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/cffff284-594b-465c-a0f1-68f8165b31fa?wsid=/subscriptions/fc011c7b-8150-4065-af8b-1a8487bc3f73/resourcegroups/Muhammad-Azaz-Farooq/workspaces/trainmodels&tid=e99647dc-1b08-454a-bf8c-699181b389ab
PipelineRun Status: Running


StepRunId: db089962-733e-4449-bc80-b6c02cb36121
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/db089962-733e-4449-bc80-b6c02cb36121?wsid=/subscriptions/fc011c7b-8150-4065-af8b-1a8487bc3f73

'Finished'

In [None]:
# Show the run-details widget for the submitted pipeline run.
from azureml.widgets import RunDetails
RunDetails(pipeline_run).show()

## Downloading Artifacts (.pkl, .yaml, and scoring files)

In [11]:
from azureml.train.automl.run import AutoMLRun

# Find the AutoML step inside the pipeline run (first match by step name) and
# reopen it as an AutoMLRun so the AutoML-specific API is available.
step_runs = pipeline_run.find_step_run('automljob_mqtt_prediction')
automl_step_run = step_runs[0]
automl_child_run = AutoMLRun(experiment, run_id=automl_step_run.id)

# Best child run together with its fitted model.
best_run, fitted_model = automl_child_run.get_output()

# Local destination for the artifacts; created if it does not exist yet.
output_dir = 'artifacts/automljob_mqtt_prediction'
os.makedirs(output_dir, exist_ok=True)

# Pull everything stored under 'outputs/' of the best run in one call.
best_run.download_files(prefix='outputs/',
                        output_directory=output_dir,
                        append_prefix=True)

print("Model's Artifacts are downloaded to:", output_dir)


  from pandas import MultiIndex, Int64Index


Model's Artifacts are downloaded to: artifacts/automljob_mqtt_prediction


## Examine Results

### Retrieve the metrics of all child runs
The outputs of the above run can be used as inputs to other steps in the pipeline.

In [12]:
# Look up the pipeline output registered under metrics_output_name and
# download it into the current directory.
metrics_output = pipeline_run.get_pipeline_output(metrics_output_name)
num_file_downloaded = metrics_output.download('.', show_progress=True)

Downloading azureml/db089962-733e-4449-bc80-b6c02cb36121/metrics_data
Downloaded azureml/db089962-733e-4449-bc80-b6c02cb36121/metrics_data, 1 files out of an estimated total of 1


In [13]:
import json

# The metrics output was downloaded into the current directory, so its
# datastore-relative path is opened here as a local file path.
with open(metrics_output._path_on_datastore) as f:
    metrics_output_result = f.read()

# Parse the JSON payload and load it into a DataFrame.
deserialized_metrics_output = json.loads(metrics_output_result)
df = pd.DataFrame(deserialized_metrics_output)

# Display the metrics so this "Examine Results" cell actually shows them.
df

### Retrieve the Best Model

In [14]:
# Retrieve best model from Pipeline Run
best_model_output = pipeline_run.get_pipeline_output(best_model_output_name)
num_file_downloaded = best_model_output.download('.', show_progress=True)

Downloading azureml/db089962-733e-4449-bc80-b6c02cb36121/model_data
Downloaded azureml/db089962-733e-4449-bc80-b6c02cb36121/model_data, 1 files out of an estimated total of 1


In [None]:
import pickle
import os

# Point at the artifacts downloaded earlier in this notebook instead of a
# placeholder directory. That download used prefix='outputs/' with
# append_prefix=True, so the files sit under <output_dir>/outputs/.
# NOTE(review): assumes the best run stores its model as outputs/model.pkl — confirm.
model_path = os.path.join(output_dir, "outputs", "model.pkl")

# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# unpickle models produced by runs you trust.
with open(model_path, "rb") as f:
    best_model = pickle.load(f)

print(best_model)

RegressionPipeline(pipeline=Pipeline(steps=[('datatransformer',
                                             DataTransformer(enable_dnn=False, enable_feature_sweeping=True, is_cross_validation=True, task='regression', working_dir='d:\\SecondPhase-Marposs(10-3-25)\\Deployment-Codes-2025\\AutoML trigger')),
                                            ('MaxAbsScaler', MaxAbsScaler()),
                                            ('XGBoostRegressor',
                                             XGBoostRegressor(problem_info=ProblemInfo(gpu_training_param_dict={'processing_unit_type': 'cpu'}), tree_method='auto'))]),
                   stddev=[0.00011101743028355996])


In [16]:
import pickle

# Deserialize the model file that the pipeline-output download placed on disk
# and show it as the cell result.
with open(best_model_output._path_on_datastore, "rb") as model_file:
    best_model = pickle.load(model_file)

best_model

In [17]:
# List the named stages of the loaded model (transformers followed by the estimator).
best_model.steps

[('datatransformer',
  DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, task='regression')),
 ('MaxAbsScaler', MaxAbsScaler()),
 ('XGBoostRegressor',
  XGBoostRegressor(n_jobs=1, problem_info=ProblemInfo(gpu_training_param_dict={'processing_unit_type': 'cpu'}), random_state=0, tree_method='auto'))]

## pkl Model Registration

In [None]:
from azureml.core.model import Model

# Register the same pickle file that was loaded above, so the path is defined
# in exactly one place (model_path) rather than duplicated as a literal.
model = Model.register(
    workspace=ws,
    model_path=model_path,  # The path to the model file
    model_name="XGBoost_regressor",
    tags={"type": "XGBoost_regressor_mqtt_prediction", "framework": "pkl"}
)

print("Model registered successfully!")
print(f"Model Name: {model.name}, Version: {model.version}")

Registering model XGBoost_regressor
Model registered successfully!
Model Name: XGBoost_regressor, Version: 2


## Convert pkl to onnx

In [None]:

# Read the pickled AutoML model back from model_path and report its class.
with open(model_path, "rb") as model_file:
    automl_model = pickle.load(model_file)

print("Loaded model type: {}".format(type(automl_model)))

Loaded model type: <class 'azureml.training.tabular.models.forecasting_pipeline_wrapper.RegressionPipeline'>


In [21]:
# A pipeline-like model exposes `steps`; its last entry is a (name, estimator)
# pair and the estimator is what gets converted later. Anything without
# `steps` is used as-is.
if not hasattr(automl_model, "steps"):
    print("Model is NOT a pipeline. Using it directly.")
    fitted_model = automl_model
else:
    print("Model is a pipeline. Extracting the final step.")
    _, fitted_model = automl_model.steps[-1]

print("Final model type: {}".format(type(fitted_model)))

Model is a pipeline. Extracting the final step.
Final model type: <class 'azureml.automl.runtime.shared.model_wrappers.XGBoostRegressor'>


In [22]:
# Unwrap the native XGBoost model
native_model = fitted_model.model  

# Confirm it's an XGBRegressor or XGBClassifier
print(type(native_model))

<class 'xgboost.sklearn.XGBRegressor'>


In [None]:
# Optional one-time setup: uncomment the lines below to install the packages
# used by the ONNX conversion cell.
#!pip install onnxmltools
#!pip install xgboost==2.1.4

In [28]:
from onnxmltools.convert import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType

# This is your native XGBRegressor model (unwrapped from AutoML)
# Make sure it's trained
print(type(native_model))

# Declare the ONNX graph input: a float tensor with a dynamic batch dimension
# and 5 columns.
# NOTE(review): 5 must match the number of features the booster was trained on — confirm.
initial_type = [("input", FloatTensorType([None, 5]))]

# Convert the underlying booster to ONNX
onnx_model = convert_xgboost(native_model.get_booster(), initial_types=initial_type)

# Save to file; the path variable is reused by the validation cell.
onnx_model_path = "xgboostmodel_mqtt_prediction.onnx"
with open(onnx_model_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

# f-string so the actual path is printed rather than the literal placeholder.
print(f"ONNX model saved as {onnx_model_path}")


<class 'xgboost.sklearn.XGBRegressor'>
ONNX model saved as {onnx_model_path}


## Validation

In [None]:
import onnx

# Load and check the ONNX model: re-read the file written by the conversion
# cell and pass it to the ONNX checker. The message below is only reached if
# the checker does not raise.
onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model)

print("ONNX model is valid!")

## Model Registration

In [None]:
from azureml.core.model import Model

# Register the ONNX file from the location where the conversion cell wrote it
# (onnx_model_path) instead of a placeholder directory that does not exist.
model = Model.register(
    workspace=ws,
    model_path=onnx_model_path,
    model_name="XGBoost_regressor_mqtt_prediction",
    tags={"type": "XGBoost_regressor_mqtt_prediction", "framework": "ONNX"},
    description="XGBoost_regressor_mqtt_prediction model converted to ONNX"
)

print("Model registered successfully!")
print(f"Model Name: {model.name}, Version: {model.version}")

Registering model XGBoost_regressor_mqtt_prediction
Model registered successfully!
Model Name: XGBoost_regressor_mqtt_prediction, Version: 1
