<a href="https://colab.research.google.com/github/FranciscoOcampoPredictiva/azureml_course/blob/main/Lecture_8_Automate_Model_Training_in_AzureML_SDK_(Pipeline_Run).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1) Installation and Setup

In [1]:
# Install azureml SDK package
! pip install -q azureml-sdk

In [2]:
# Import the class
from azureml.core import Workspace

In [3]:
# Build the workspace handle from the config.json uploaded to the Colab session
ws = Workspace.from_config(path="/content/config.json")

# 2) Running the Experiment

## Creating the custom environment

In [4]:
from azureml.core import Environment
from azureml.core.environment import CondaDependencies

In [5]:
# Custom environment used by every pipeline step
myenv = Environment(name="MyEnvironment")

# Run inside a Docker container
# NOTE(review): newer azureml-core releases appear to deprecate `docker.enabled`
# in favour of a DockerConfiguration on the run config — confirm against the
# installed SDK version.
myenv.docker.enabled = True

# Dependencies are not user-managed; they come from the conda specification
myenv.python.user_managed_dependencies = False



In [6]:
# Package specification for the environment: conda packages for the ML code,
# plus the Azure ML SDK installed through pip
myenv_dep = CondaDependencies.create(
    pip_packages=['azureml-sdk'],
    conda_packages=['scikit-learn', 'pandas'],
)

# Attach the specification to the environment
myenv.python.conda_dependencies = myenv_dep

In [7]:
# Register the environment in the workspace; the returned definition is left
# as the cell's last expression so it is displayed
myenv.register(workspace=ws)

{
    "assetId": "azureml://locations/centralus/workspaces/b9dd55e8-c605-4c0b-b3ee-d4324e25cb2e/environments/MyEnvironment/versions/1",
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20230509.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "buildContext": null,
        "enabled": true,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "MyEnvironment",
    "p

## Provision the compute cluster

In [8]:
# Name under which the compute cluster is provisioned in the workspace
cluster_name = "Compute-Cluster"

In [9]:
# Import the classes
from azureml.core.compute import AmlCompute, ComputeTarget

In [10]:
# Provisioning settings for the cluster: VM size and the upper bound on nodes
compute_config = AmlCompute.provisioning_configuration(
    max_nodes=2,
    vm_size='Standard_DS3_v2',
)

In [11]:
# Get the compute cluster, creating it only when it does not exist yet.
# Looking it up first keeps this cell safe to re-run (Restart & Run All)
# after the cluster has already been provisioned.
from azureml.core.compute_target import ComputeTargetException

try:
    # Attach to an existing cluster with this name, if there is one
    compute_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print(f'Found existing compute target "{cluster_name}", reusing it.')
except ComputeTargetException:
    # No such cluster in the workspace: provision a new one
    compute_cluster = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )

# Block until provisioning has finished, printing progress
compute_cluster.wait_for_completion(show_output=True)

InProgress.
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Run configuration steps

In [12]:
from azureml.core.runconfig import RunConfiguration
# Run configuration shared by the pipeline steps: which environment to use
# and which compute target to execute on
run_config = RunConfiguration()
run_config.environment = myenv
run_config.target = compute_cluster

## Defining the pipeline steps

In [13]:
# Importing the classes
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import PipelineData

# Access the registered dataset by name.
# `Dataset.get_by_name` fails right here when the dataset is missing, whereas
# `ws.datasets.get(...)` would silently return None and only break later when
# `input_ds.as_named_input(...)` is called.
from azureml.core import Dataset

input_ds = Dataset.get_by_name(ws, name='Churn-Modelling-Data-SDK')

# Intermediate pipeline data, stored on the workspace's default datastore,
# used to hand data from one step to the next
dataFolder = PipelineData(name='datafolder', datastore=ws.get_default_datastore())

In [14]:
# Step 1 - Data Processing
# Runs data_processing_script.py with the dataset as the named input
# 'raw_data'; `dataFolder` is declared as the step's output and also passed
# to the script through --datafolder.
dataPrep_step = PythonScriptStep(
    name='Data Processing',
    script_name='data_processing_script.py',
    source_directory='.',
    arguments=['--datafolder', dataFolder],
    inputs=[input_ds.as_named_input('raw_data')],
    outputs=[dataFolder],
    runconfig=run_config,
)

In [15]:
# Step 2 - Training the Model
# Runs model_training_script.py; `dataFolder` is declared as the step's
# input and passed to the script through --datafolder.
train_step = PythonScriptStep(
    name='Model Training',
    script_name='model_training_script.py',
    source_directory='.',
    arguments=['--datafolder', dataFolder],
    inputs=[dataFolder],
    runconfig=run_config,
)

## Configure and Build the Pipeline

In [16]:
from azureml.pipeline.core import Pipeline

# The step objects (not their names) that make up the pipeline
steps = [dataPrep_step, train_step]

# Build the pipeline in the workspace from those steps
new_pipeline = Pipeline(steps=steps, workspace=ws)

## Creating experiment and running the pipeline

In [17]:
from azureml.core import Experiment

# Experiment named 'PipelineExperiment' in the workspace
new_experiment = Experiment(name='PipelineExperiment', workspace=ws)

# Submit the pipeline, then block until the run completes while printing its
# progress; the final status is left as the cell's last expression so it is
# displayed
new_pipeline_run = new_experiment.submit(new_pipeline)
new_pipeline_run.wait_for_completion(show_output=True)

Created step Data Processing [0fca5e8e][a186124a-dd4b-42fe-9858-86c8e86671a7], (This step will run and generate new outputs)Created step Model Training [ed53ccaf][55310afd-5c66-4e5f-84b7-e3c60ad5da06], (This step will run and generate new outputs)

Submitted PipelineRun e03c172c-36a7-4762-9d59-5e3cb518083d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/e03c172c-36a7-4762-9d59-5e3cb518083d?wsid=/subscriptions/8dd92a15-7a35-444a-99e8-7e44a3d0ae52/resourcegroups/azure-ml/workspaces/new_wrokspace&tid=20d3fbd0-4c6d-4dbc-b91c-aa00b337238c
PipelineRunId: e03c172c-36a7-4762-9d59-5e3cb518083d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/e03c172c-36a7-4762-9d59-5e3cb518083d?wsid=/subscriptions/8dd92a15-7a35-444a-99e8-7e44a3d0ae52/resourcegroups/azure-ml/workspaces/new_wrokspace&tid=20d3fbd0-4c6d-4dbc-b91c-aa00b337238c
PipelineRun Status: Running


StepRunId: f0008e28-da6c-4bfc-b931-56264c048dee
Link to Azure Machine Learning Portal: https://ml.azure.com/runs

'Finished'