Skip to content

Commit bcdac5c

Browse files
authored
Jitghosh/parallelbatchscore (#289)
* Batch Scoring First Draft: Added pipeline creation and run scripts, scoring script, new environment variables, changes to env loading script, compute creation and AML environment creation scripts, and new Azure pipeline for batch scoring CI * Score copy step added * Modified bootstrap.py, updated getting started doc * Addressed PR comments * Addressed PR comments * Doc fix * Doc fix
1 parent 9056285 commit bcdac5c

19 files changed

+1311
-95
lines changed

.env.example

+33-5
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,33 @@
11
# Azure Subscription Variables
22
SUBSCRIPTION_ID = ''
3-
LOCATION = 'westeurope'
3+
LOCATION = ''
44
TENANT_ID = ''
55
BASE_NAME = ''
66
SP_APP_ID = ''
77
SP_APP_SECRET = ''
8-
RESOUCE_GROUP = 'mlops-rg'
8+
RESOURCE_GROUP = 'mlops-RG'
99

1010
# Mock build/release ID for local testing
1111
BUILD_BUILDID = '001'
1212

1313
# Azure ML Workspace Variables
14-
WORKSPACE_NAME = 'aml-workspace'
15-
EXPERIMENT_NAME = ''
14+
WORKSPACE_NAME = 'mlops-aml-ws'
15+
EXPERIMENT_NAME = 'mlopspython'
1616

1717
# AML Compute Cluster Config
1818
AML_ENV_NAME='diabetes_regression_training_env'
19+
AML_ENV_TRAIN_CONDA_DEP_FILE="conda_dependencies.yml"
1920
AML_COMPUTE_CLUSTER_NAME = 'train-cluster'
2021
AML_COMPUTE_CLUSTER_CPU_SKU = 'STANDARD_DS2_V2'
2122
AML_CLUSTER_MAX_NODES = '4'
2223
AML_CLUSTER_MIN_NODES = '0'
2324
AML_CLUSTER_PRIORITY = 'lowpriority'
2425
# Training Config
25-
MODEL_NAME = 'sklearn_regression_model.pkl'
26+
MODEL_NAME = 'diabetes_regression_model.pkl'
2627
MODEL_VERSION = '1'
2728
TRAIN_SCRIPT_PATH = 'training/train.py'
29+
30+
2831
# AML Pipeline Config
2932
TRAINING_PIPELINE_NAME = 'Training Pipeline'
3033
MODEL_PATH = ''
@@ -51,3 +54,28 @@ ALLOW_RUN_CANCEL = 'true'
5154

5255
# Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yml.
5356
AML_REBUILD_ENVIRONMENT = 'false'
57+
58+
59+
60+
USE_GPU_FOR_SCORING = "false"
61+
AML_ENV_SCORE_CONDA_DEP_FILE="conda_dependencies_scoring.yml"
62+
AML_ENV_SCORECOPY_CONDA_DEP_FILE="conda_dependencies_scorecopy.yml"
63+
# AML Compute Cluster Config for parallel batch scoring
64+
AML_ENV_NAME_SCORING='diabetes_regression_scoring_env'
65+
AML_ENV_NAME_SCORE_COPY='diabetes_regression_score_copy_env'
66+
AML_COMPUTE_CLUSTER_NAME_SCORING = 'score-cluster'
67+
AML_COMPUTE_CLUSTER_CPU_SKU_SCORING = 'STANDARD_DS2_V2'
68+
AML_CLUSTER_MAX_NODES_SCORING = '4'
69+
AML_CLUSTER_MIN_NODES_SCORING = '0'
70+
AML_CLUSTER_PRIORITY_SCORING = 'lowpriority'
71+
AML_REBUILD_ENVIRONMENT_SCORING = 'true'
72+
BATCHSCORE_SCRIPT_PATH = 'scoring/parallel_batchscore.py'
73+
BATCHSCORE_COPY_SCRIPT_PATH = 'scoring/parallel_batchscore_copyoutput.py'
74+
75+
76+
SCORING_DATASTORE_INPUT_CONTAINER = 'input'
77+
SCORING_DATASTORE_INPUT_FILENAME = 'diabetes_scoring_input.csv'
78+
SCORING_DATASTORE_OUTPUT_CONTAINER = 'output'
79+
SCORING_DATASTORE_OUTPUT_FILENAME = 'diabetes_scoring_output.csv'
80+
SCORING_DATASET_NAME = 'diabetes_scoring_ds'
81+
SCORING_PIPELINE_NAME = 'diabetes-scoring-pipeline'
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Continuous Integration (CI) pipeline that orchestrates the batch scoring of the diabetes_regression model.
2+
3+
resources:
4+
containers:
5+
- container: mlops
6+
image: mcr.microsoft.com/mlops/python:latest
7+
8+
9+
pr: none
10+
trigger:
11+
branches:
12+
include:
13+
- master
14+
paths:
15+
include:
16+
- diabetes_regression/scoring/parallel_batchscore.py
17+
- ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py
18+
- ml_service/pipelines/run_parallel_batchscore_pipeline.py
19+
20+
variables:
21+
- template: diabetes_regression-variables-template.yml
22+
- group: devopsforai-aml-vg
23+
24+
pool:
25+
vmImage: ubuntu-latest
26+
27+
stages:
28+
- stage: 'Batch_Scoring_Pipeline_CI'
29+
displayName: 'Batch Scoring Pipeline CI'
30+
jobs:
31+
- job: "Build_Batch_Scoring_Pipeline"
32+
displayName: "Build Batch Scoring Pipeline"
33+
container: mlops
34+
timeoutInMinutes: 0
35+
steps:
36+
- template: code-quality-template.yml
37+
- task: AzureCLI@1
38+
name: publish_batchscore
39+
inputs:
40+
azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
41+
scriptLocation: inlineScript
42+
workingDirectory: $(Build.SourcesDirectory)
43+
inlineScript: |
44+
set -e # fail on error
45+
export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
46+
# Invoke the Python module that builds and publishes the batch scoring pipeline
47+
python -m ml_service.pipelines.diabetes_regression_build_parallel_batchscore_pipeline
48+
49+
- job: "Run_Batch_Score_Pipeline"
50+
displayName: "Run Batch Scoring Pipeline"
51+
dependsOn: "Build_Batch_Scoring_Pipeline"
52+
timeoutInMinutes: 240
53+
pool: server
54+
variables:
55+
pipeline_id: $[ dependencies.Build_Batch_Scoring_Pipeline.outputs['publish_batchscore.pipeline_id']]
56+
steps:
57+
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
58+
displayName: 'Invoke Batch Scoring pipeline'
59+
inputs:
60+
azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
61+
PipelineId: '$(pipeline_id)'
62+
ExperimentName: '$(EXPERIMENT_NAME)'
63+
PipelineParameters: '"ParameterAssignments": {"model_name": "$(MODEL_NAME)"}'
64+

.pipelines/diabetes_regression-variables-template.yml

+62
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ variables:
1616
# The path to the model scoring script relative to SOURCES_DIR_TRAIN
1717
- name: SCORE_SCRIPT
1818
value: scoring/score.py
19+
1920

2021
# Azure ML Variables
2122
- name: EXPERIMENT_NAME
@@ -35,6 +36,8 @@ variables:
3536
# AML Compute Cluster Config
3637
- name: AML_ENV_NAME
3738
value: diabetes_regression_training_env
39+
- name: AML_ENV_TRAIN_CONDA_DEP_FILE
40+
value: "conda_dependencies.yml"
3841
- name: AML_COMPUTE_CLUSTER_CPU_SKU
3942
value: STANDARD_DS2_V2
4043
- name: AML_COMPUTE_CLUSTER_NAME
@@ -69,3 +72,62 @@ variables:
6972
# Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yml.
7073
# - name: AML_REBUILD_ENVIRONMENT
7174
# value: "false"
75+
76+
# Variables below are used for controlling various aspects of batch scoring
77+
- name: USE_GPU_FOR_SCORING
78+
value: False
79+
# Conda dependencies for the batch scoring step
80+
- name: AML_ENV_SCORE_CONDA_DEP_FILE
81+
value: "conda_dependencies_scoring.yml"
82+
# Conda dependencies for the score copying step
83+
- name: AML_ENV_SCORECOPY_CONDA_DEP_FILE
84+
value: "conda_dependencies_scorecopy.yml"
85+
# AML Compute Cluster Config for parallel batch scoring
86+
- name: AML_ENV_NAME_SCORING
87+
value: diabetes_regression_scoring_env
88+
- name: AML_ENV_NAME_SCORE_COPY
89+
value: diabetes_regression_score_copy_env
90+
- name: AML_COMPUTE_CLUSTER_CPU_SKU_SCORING
91+
value: STANDARD_DS2_V2
92+
- name: AML_COMPUTE_CLUSTER_NAME_SCORING
93+
value: score-cluster
94+
- name: AML_CLUSTER_MIN_NODES_SCORING
95+
value: 0
96+
- name: AML_CLUSTER_MAX_NODES_SCORING
97+
value: 4
98+
- name: AML_CLUSTER_PRIORITY_SCORING
99+
value: lowpriority
100+
# The path to the batch scoring script relative to SOURCES_DIR_TRAIN
101+
- name: BATCHSCORE_SCRIPT_PATH
102+
value: scoring/parallel_batchscore.py
103+
- name: BATCHSCORE_COPY_SCRIPT_PATH
104+
value: scoring/parallel_batchscore_copyoutput.py
105+
# Flag to allow rebuilding the AML Environment after it was built for the first time.
106+
# This enables dependency updates from the conda dependencies yaml for scoring activities.
107+
- name: AML_REBUILD_ENVIRONMENT_SCORING
108+
value: "true"
109+
110+
# Datastore config for scoring
111+
# The storage account name and key are supplied as variables in a variable group
112+
# in the Azure Pipelines library for this project. Please refer to repo docs for
113+
# more details
114+
115+
# Blob container where the input data for scoring can be found
116+
- name: SCORING_DATASTORE_INPUT_CONTAINER
117+
value: "input"
118+
# Blob name for the input data - include any applicable path in the string
119+
- name: SCORING_DATASTORE_INPUT_FILENAME
120+
value: "diabetes_scoring_input.csv"
121+
# Blob container where the output data for scoring can be found
122+
- name: SCORING_DATASTORE_OUTPUT_CONTAINER
123+
value: "output"
124+
# Blob name for the output data - include any applicable path in the string
125+
- name: SCORING_DATASTORE_OUTPUT_FILENAME
126+
value: "diabetes_scoring_output.csv"
127+
# Dataset name for input data for scoring
128+
- name: SCORING_DATASET_NAME
129+
value: "diabetes_scoring_ds"
130+
# Scoring pipeline name
131+
- name: SCORING_PIPELINE_NAME
132+
value: "diabetes-scoring-pipeline"
133+

bootstrap/bootstrap.py

+2
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,12 @@ def replace_project_name(project_dir, project_name, rename_name):
8787
r".pipelines/diabetes_regression-ci.yml",
8888
r".pipelines/abtest.yml",
8989
r".pipelines/diabetes_regression-ci-image.yml",
90+
r".pipelines/diabetes_regression-batchscoring-ci.yml",
9091
r".pipelines/diabetes_regression-get-model-version-template.yml", # NOQA: E501
9192
r".pipelines/diabetes_regression-variables-template.yml",
9293
r"environment_setup/Dockerfile",
9394
r"environment_setup/install_requirements.sh",
95+
r"ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py", # NOQA: E501
9496
r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py", # NOQA: E501
9597
r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py", # NOQA: E501
9698
r"ml_service/pipelines/diabetes_regression_build_train_pipeline.py", # NOQA: E501
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Conda environment specification. The dependencies defined in this file will
2+
# be automatically provisioned for managed runs. These include runs against
3+
# the localdocker, remotedocker, and cluster compute targets.
4+
5+
# Note that this file is NOT used to automatically manage dependencies for the
6+
# local compute target. To provision these dependencies locally, run:
7+
# conda env update --file conda_dependencies_scorecopy.yml
8+
9+
# Details about the Conda environment file format:
10+
# https://conda.io/docs/using/envs.html#create-environment-file-by-hand
11+
12+
# For managing Spark packages and configuration, see spark_dependencies.yml.
13+
# Version of this configuration file's structure and semantics in AzureML.
14+
# This directive is stored in a comment to preserve the Conda file structure.
15+
# [AzureMlVersion] = 2
16+
17+
# These dependencies are used to create the environment used by the batch score
18+
# copy pipeline step
19+
name: diabetes_regression_score_copy_env
20+
dependencies:
21+
# The python interpreter version.
22+
# Currently Azure ML Workbench only supports 3.5.2 and later.
23+
- python=3.7.*
24+
- pip
25+
26+
- pip:
27+
# Base AzureML SDK
28+
- azureml-sdk==1.6.*
29+
30+
# Score copying deps
31+
- azure-storage-blob
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Conda environment specification. The dependencies defined in this file will
2+
# be automatically provisioned for managed runs. These include runs against
3+
# the localdocker, remotedocker, and cluster compute targets.
4+
5+
# Note that this file is NOT used to automatically manage dependencies for the
6+
# local compute target. To provision these dependencies locally, run:
7+
# conda env update --file conda_dependencies_scoring.yml
8+
9+
# Details about the Conda environment file format:
10+
# https://conda.io/docs/using/envs.html#create-environment-file-by-hand
11+
12+
# For managing Spark packages and configuration, see spark_dependencies.yml.
13+
# Version of this configuration file's structure and semantics in AzureML.
14+
# This directive is stored in a comment to preserve the Conda file structure.
15+
# [AzureMlVersion] = 2
16+
17+
# These dependencies are used to create the environment used by the batch score
18+
# pipeline step
19+
name: diabetes_regression_scoring_env
20+
dependencies:
21+
# The python interpreter version.
22+
# Currently Azure ML Workbench only supports 3.5.2 and later.
23+
- python=3.7.*
24+
- pip
25+
26+
- pip:
27+
# Base AzureML SDK
28+
- azureml-sdk==1.6.*
29+
30+
# Scoring deps
31+
- scikit-learn
32+
- pandas

0 commit comments

Comments
 (0)