# Model Training

In [2]:
from azureml.core import Workspace, Datastore, Dataset, Experiment, Environment

# Connect to the AML Workspace

In [3]:
# Load the workspace using the configuration found at the path ".azure"
ws = Workspace.from_config(path=".azure")

# Default datastore attached to this workspace
datastore = ws.get_default_datastore()

# Look up the compute target registered under the name "cpu-cluster"
compute_target = ws.compute_targets["cpu-cluster"]

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


# Import the training data

In [4]:
# Fetch the dataset registered as "diabetes_cleaned" and load it into pandas
diabetes_ds = Dataset.get_by_name(workspace=ws, name="diabetes_cleaned")
diabetes_df = diabetes_ds.to_pandas_dataframe()

In [5]:
# Preview the first five rows to sanity-check the loaded columns
diabetes_df.head(n=5)

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


# Train a Model with Python

## Train DecisionTreeClassifier

In [6]:
%%writefile src/train_DTC.py


from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from azureml.core import Workspace, Datastore, Dataset, Run
from azureml.core.resource_configuration import ResourceConfiguration
from azureml.core.model import Model
import pandas as pd
import os
import joblib
import sklearn

# get the current run
run = Run.get_context()
ws = run.experiment.workspace

datastore = ws.get_default_datastore()

# get the dataset
ds = Dataset.get_by_name(ws, "diabetes_cleaned")
diabetes_df = ds.to_pandas_dataframe()


X = diabetes_df.drop("class", axis=1)
y = diabetes_df["class"]

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = diabetes_df["class"], random_state=0)

# init the model
model = DecisionTreeClassifier()

# train the model
model.fit(X_train, y_train)

# get the predictions
y_pred = model.predict(X_test)


# register a sample_input
sample_input = diabetes_df[:1].drop("class", axis=1)
sample_input = Dataset.Tabular.register_pandas_dataframe(name="diabetes_sample_input",target=datastore, dataframe=sample_input)

# register a sample_output
sample_output = pd.DataFrame({"class": y_pred[:1]})
sample_output = Dataset.Tabular.register_pandas_dataframe(name="diabetes_sample_output",target=datastore, dataframe=sample_output)

acc = accuracy_score(y_test, y_pred)
# log the accuracy to the Run
run.log("acc", acc)

# log confusion matrix
cmtx = sklearn.metrics.confusion_matrix(y_test,(y_pred))

cmtx_wrapper =  {
       "schema_type": "confusion_matrix",
       "schema_version": "v1",
       "data": {
           "class_labels": diabetes_df["class"].unique().tolist(),
           "matrix": cmtx.tolist()
       }
   }
   
run.log_confusion_matrix("confusion matrix", cmtx_wrapper, description='')

# save the model to disk
joblib.dump(model, 'model.pkl')

# register the model in the workspace
model_reg = Model.register(model_path="model.pkl",
                       model_name="sklearn-model",
                       model_framework=Model.Framework.SCIKITLEARN,
                       model_framework_version=sklearn.__version__,
                       sample_input_dataset=sample_input,
                       sample_output_dataset=sample_output,
                       tags={'area': "diabetes", 'type': "classification"},
                       resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5),
                       description="DecisionTreeClassifier for diabetes dataset",
                       workspace=ws)

Overwriting src/train_DTC.py


## Define the Run Configuration

In [7]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core.graph import PipelineParameter

from azureml.pipeline.core import Pipeline
from azureml.core.runconfig import RunConfiguration
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.environment import CondaDependencies

from azureml.core import ScriptRunConfig

# Build the training environment from the conda specification first.
# Assigning a new Environment to `run_config.environment` replaces the previous
# object, so any settings must be applied to this environment, not before it.
train_env = Environment.from_conda_specification(name="train-env", file_path="environment.yml")

# set Docker base image to the default CPU-based image
train_env.docker.base_image = DEFAULT_CPU_IMAGE

# let Azure ML create the conda environment described in environment.yml;
# user-managed dependencies would skip that build and ignore the specification
train_env.python.user_managed_dependencies = False

# NOTE(review): `docker.enabled` is intentionally not set here; confirm whether
# the installed azureml-core version still requires it for this compute target.

# create a new runconfig object that uses the configured environment
run_config = RunConfiguration()
run_config.environment = train_env


## Create the Python script step

In [8]:

# One pipeline step that executes src/train_DTC.py on the "cpu-cluster" compute
train_DTC = PythonScriptStep(
    runconfig=run_config,
    compute_target="cpu-cluster",
    source_directory="src",
    script_name="train_DTC.py",
)

# Pipeline made up of just that single training step
model_train_pipe = Pipeline(steps=[train_DTC], workspace=ws)

## Validate the Pipeline Configuration

In [9]:
# validate() returns the list of problems it found; report them instead of
# announcing success unconditionally
validation_errors = model_train_pipe.validate()
if validation_errors:
    print(f"Pipeline validation found {len(validation_errors)} issue(s):")
    for validation_error in validation_errors:
        print(f"  - {validation_error}")
else:
    print("Pipeline validation complete")

Step train_DTC.py is ready to be created [6837fb8d]
Pipeline validation complete


## Run the Pipeline

In [10]:
# Submit the single-step pipeline under the experiment named 'training'
training_experiment = Experiment(workspace=ws, name='training')
pipeline_run = training_experiment.submit(model_train_pipe)

# Wait for the run while streaming its output; the returned value is the
# cell's displayed result
pipeline_run.wait_for_completion(show_output=True)

Created step train_DTC.py [6837fb8d][b03d5904-f257-48bd-9ad4-2b48e95e29d8], (This step will run and generate new outputs)
Submitted PipelineRun 67984c12-e3ab-4e6a-a28a-4247949af128
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/training/runs/67984c12-e3ab-4e6a-a28a-4247949af128?wsid=/subscriptions/3a0172d3-ec0d-46bb-a88a-ff41a302711a/resourcegroups/Evonik/workspaces/AMLWorkspace
PipelineRunId: 67984c12-e3ab-4e6a-a28a-4247949af128
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/training/runs/67984c12-e3ab-4e6a-a28a-4247949af128?wsid=/subscriptions/3a0172d3-ec0d-46bb-a88a-ff41a302711a/resourcegroups/Evonik/workspaces/AMLWorkspace
PipelineRun Status: NotStarted
PipelineRun Status: Running
Expected a StepRun object but received <class 'azureml.core.run.Run'> instead.
This usually indicates a package conflict with one of the dependencies of azureml-core or azureml-pipeline-core.
Please check for package conflicts in your python environment




'Finished'

********

# Train Multiple Models in the Pipeline

## Define the steps

In [11]:
def make_train_step(script_name):
    """Create a PythonScriptStep for one training script.

    All training steps share the same source directory ("src"), compute target
    ("cpu-cluster") and run configuration; only the script differs.

    Args:
        script_name: File name of the training script inside "src".

    Returns:
        The configured PythonScriptStep.
    """
    return PythonScriptStep(
        script_name=script_name,
        source_directory="src",
        compute_target="cpu-cluster",
        runconfig=run_config)


# DecisionTreeClassifier
train_DTC = make_train_step("train_DTC.py")

# KNeighborsClassifier
# NOTE(review): train_KNC.py and train_RFC.py are expected to already exist in
# "src"; this notebook only writes train_DTC.py.
train_KNC = make_train_step("train_KNC.py")

# RandomForestClassifier
train_RFC = make_train_step("train_RFC.py")


multi_model_train_pipe = Pipeline(workspace=ws, steps=[train_DTC, train_KNC, train_RFC])

## Run the Pipelines

In [12]:
# Submit the multi-model pipeline under the experiment named 'training'
training_experiment = Experiment(workspace=ws, name='training')
pipeline_run = training_experiment.submit(multi_model_train_pipe)

# Wait for the run while streaming its output; the returned value is the
# cell's displayed result
pipeline_run.wait_for_completion(show_output=True)

Created step train_DTC.py [3ad555df][b03d5904-f257-48bd-9ad4-2b48e95e29d8], (This step is eligible to reuse a previous run's output)
Created step train_KNC.py [34cee30b][07c2a78b-8a81-4f94-9b5e-0bb505dea832], (This step will run and generate new outputs)
Created step train_RFC.py [6a2fac9d][4fd73541-dc02-43f5-a922-86ea561f4ede], (This step will run and generate new outputs)
Submitted PipelineRun 3058143a-7e0e-4fb6-a1c8-9714fd395ff6
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/training/runs/3058143a-7e0e-4fb6-a1c8-9714fd395ff6?wsid=/subscriptions/3a0172d3-ec0d-46bb-a88a-ff41a302711a/resourcegroups/Evonik/workspaces/AMLWorkspace
PipelineRunId: 3058143a-7e0e-4fb6-a1c8-9714fd395ff6
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/training/runs/3058143a-7e0e-4fb6-a1c8-9714fd395ff6?wsid=/subscriptions/3a0172d3-ec0d-46bb-a88a-ff41a302711a/resourcegroups/Evonik/workspaces/AMLWorkspace
PipelineRun Status: NotStarted
PipelineRun Status: Running
E

'Finished'