add fast-checkpoint examples for nebulaml #2123

Merged 10 commits on Apr 19, 2023
50 changes: 50 additions & 0 deletions .github/workflows/cli-jobs-nebulaml-PyTorch_CNN_MNIST-job.yml
@@ -0,0 +1,50 @@
# This code is autogenerated.
# Code is generated by running custom script: python3 readme.py
# Any manual changes to this file may cause incorrect behavior.
# Any manual changes will be overwritten if the code is regenerated.

name: cli-jobs-nebulaml-PyTorch_CNN_MNIST-job
on:
workflow_dispatch:
schedule:
- cron: "54 3/12 * * *"
pull_request:
branches:
- main
paths:
- cli/jobs/nebulaml/PyTorch_CNN_MNIST/**
- infra/**
- .github/workflows/cli-jobs-nebulaml-PyTorch_CNN_MNIST-job.yml
- cli/setup.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
creds: ${{secrets.AZUREML_CREDENTIALS}}
- name: bootstrap resources
run: |
echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}';
bash bootstrap.sh
working-directory: infra
continue-on-error: false
- name: setup-cli
run: |
source "${{ github.workspace }}/infra/sdk_helpers.sh";
source "${{ github.workspace }}/infra/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: run job
run: |
source "${{ github.workspace }}/infra/sdk_helpers.sh";
source "${{ github.workspace }}/infra/init_environment.sh";
bash -x ../../../run-job.sh job.yml
working-directory: cli/jobs/nebulaml/PyTorch_CNN_MNIST
7 changes: 6 additions & 1 deletion cli/README.md
@@ -38,7 +38,6 @@
path|status|
-|-
[batch-score-rest.sh](batch-score-rest.sh)|[![batch-score-rest](https://github.com/Azure/azureml-examples/workflows/cli-scripts-batch-score-rest/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-batch-score-rest.yml)
[batch-score.sh](batch-score.sh)|[![batch-score](https://github.com/Azure/azureml-examples/workflows/cli-scripts-batch-score/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-batch-score.yml)
[create-registries.sh](create-registries.sh)|[![create-registries](https://github.com/Azure/azureml-examples/workflows/cli-scripts-create-registries/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-create-registries.yml)
[deploy-custom-container-minimal-multimodel.sh](deploy-custom-container-minimal-multimodel.sh)|[![deploy-custom-container-minimal-multimodel](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-custom-container-minimal-multimodel/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-custom-container-minimal-multimodel.yml)
[deploy-custom-container-minimal-single-model.sh](deploy-custom-container-minimal-single-model.sh)|[![deploy-custom-container-minimal-single-model](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-custom-container-minimal-single-model/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-custom-container-minimal-single-model.yml)
[deploy-custom-container-mlflow-multideployment-scikit.sh](deploy-custom-container-mlflow-multideployment-scikit.sh)|[![deploy-custom-container-mlflow-multideployment-scikit](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-custom-container-mlflow-multideployment-scikit/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-custom-container-mlflow-multideployment-scikit.yml)
@@ -80,6 +79,9 @@
path|status|description
[jobs/basics/hello-automl/hello-automl-job-basic.yml](jobs/basics/hello-automl/hello-automl-job-basic.yml)|[![jobs/basics/hello-automl/hello-automl-job-basic](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-automl-hello-automl-job-basic/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-automl-hello-automl-job-basic.yml)|A Classification job using bank marketing
[jobs/deepspeed/deepspeed-autotuning/job.yml](jobs/deepspeed/deepspeed-autotuning/job.yml)|[![jobs/deepspeed/deepspeed-autotuning/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-deepspeed-deepspeed-autotuning-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-deepspeed-deepspeed-autotuning-job.yml)|*no description*
[jobs/deepspeed/deepspeed-training/job.yml](jobs/deepspeed/deepspeed-training/job.yml)|[![jobs/deepspeed/deepspeed-training/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-deepspeed-deepspeed-training-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-deepspeed-deepspeed-training-job.yml)|*no description*
[jobs/nebulaml/PyTorch_CNN_MNIST/job.yml](jobs/nebulaml/PyTorch_CNN_MNIST/job.yml)|[![jobs/nebulaml/PyTorch_CNN_MNIST/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-nebulaml-PyTorch_CNN_MNIST-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-nebulaml-PyTorch_CNN_MNIST-job.yml)|*no description*
[jobs/nebulaml/bert-pretrain-deepspeed/job.yml](jobs/nebulaml/bert-pretrain-deepspeed/job.yml)|[![jobs/nebulaml/bert-pretrain-deepspeed/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-nebulaml-bert-pretrain-deepspeed-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-nebulaml-bert-pretrain-deepspeed-job.yml)|*no description*
[jobs/nebulaml/cifar10_deepspeed/job.yml](jobs/nebulaml/cifar10_deepspeed/job.yml)|[![jobs/nebulaml/cifar10_deepspeed/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-nebulaml-cifar10_deepspeed-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-nebulaml-cifar10_deepspeed-job.yml)|*no description*
[jobs/pipelines-with-components/nyc_taxi_data_regression/single-job-pipeline.yml](jobs/pipelines-with-components/nyc_taxi_data_regression/single-job-pipeline.yml)|[![jobs/pipelines-with-components/nyc_taxi_data_regression/single-job-pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-single-job-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-pipelines-with-components-nyc_taxi_data_regression-single-job-pipeline.yml)|Single job pipeline to train regression model based on nyc taxi dataset
[jobs/single-step/dask/nyctaxi/job.yml](jobs/single-step/dask/nyctaxi/job.yml)|[![jobs/single-step/dask/nyctaxi/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-single-step-dask-nyctaxi-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-single-step-dask-nyctaxi-job.yml)|This sample shows how to run a distributed DASK job on AzureML. The 24GB NYC Taxi dataset is read in CSV format by a 4 node DASK cluster, processed and then written as job output in parquet format.
[jobs/single-step/gpu_perf/gpu_perf_job.yml](jobs/single-step/gpu_perf/gpu_perf_job.yml)|[![jobs/single-step/gpu_perf/gpu_perf_job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-single-step-gpu_perf-gpu_perf_job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-single-step-gpu_perf-gpu_perf_job.yml)|Runs NCCL-tests on gpu nodes.
@@ -200,6 +202,9 @@
path|status|description
[jobs/automl-standalone-jobs/cli-automl-text-ner-conll/cli-automl-text-ner-conll2003.yml](jobs/automl-standalone-jobs/cli-automl-text-ner-conll/cli-automl-text-ner-conll2003.yml)|[![jobs/automl-standalone-jobs/cli-automl-text-ner-conll/cli-automl-text-ner-conll2003](https://github.com/Azure/azureml-examples/workflows/cli-jobs-automl-standalone-jobs-cli-automl-text-ner-conll-cli-automl-text-ner-conll2003/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-automl-standalone-jobs-cli-automl-text-ner-conll-cli-automl-text-ner-conll2003.yml)|A text named entity recognition job using CoNLL 2003 data
[responsible-ai/cli-responsibleaidashboard-housing-classification/cli-responsibleaidashboard-housing-classification.yml](responsible-ai/cli-responsibleaidashboard-housing-classification/cli-responsibleaidashboard-housing-classification.yml)|[![responsible-ai/cli-responsibleaidashboard-housing-classification/cli-responsibleaidashboard-housing-classification](https://github.com/Azure/azureml-examples/workflows/cli-responsible-ai-cli-responsibleaidashboard-housing-classification-cli-responsibleaidashboard-housing-classification/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-responsible-ai-cli-responsibleaidashboard-housing-classification-cli-responsibleaidashboard-housing-classification.yml)|*no description*
[responsible-ai/cli-responsibleaidashboard-programmer-regression/cli-responsibleaidashboard-programmer-regression.yml](responsible-ai/cli-responsibleaidashboard-programmer-regression/cli-responsibleaidashboard-programmer-regression.yml)|[![responsible-ai/cli-responsibleaidashboard-programmer-regression/cli-responsibleaidashboard-programmer-regression](https://github.com/Azure/azureml-examples/workflows/cli-responsible-ai-cli-responsibleaidashboard-programmer-regression-cli-responsibleaidashboard-programmer-regression/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-responsible-ai-cli-responsibleaidashboard-programmer-regression-cli-responsibleaidashboard-programmer-regression.yml)|*no description*
[jobs/parallel/1a_oj_sales_prediction/pipeline.yml](jobs/parallel/1a_oj_sales_prediction/pipeline.yml)|[![jobs/parallel/1a_oj_sales_prediction/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-parallel-1a_oj_sales_prediction-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-parallel-1a_oj_sales_prediction-pipeline.yml)|The hello world pipeline job with partition by key
[jobs/parallel/2a_iris_batch_prediction/pipeline.yml](jobs/parallel/2a_iris_batch_prediction/pipeline.yml)|[![jobs/parallel/2a_iris_batch_prediction/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-parallel-2a_iris_batch_prediction-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-parallel-2a_iris_batch_prediction-pipeline.yml)|The hello world pipeline job with inline parallel job
[jobs/parallel/3a_mnist_batch_identification/pipeline.yml](jobs/parallel/3a_mnist_batch_identification/pipeline.yml)|[![jobs/parallel/3a_mnist_batch_identification/pipeline](https://github.com/Azure/azureml-examples/workflows/cli-jobs-parallel-3a_mnist_batch_identification-pipeline/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-parallel-3a_mnist_batch_identification-pipeline.yml)|The hello world pipeline job with inline parallel job

**Endpoints** ([endpoints](endpoints))

72 changes: 72 additions & 0 deletions cli/jobs/nebulaml/PyTorch_CNN_MNIST/README.md
@@ -0,0 +1,72 @@
# PyTorch CNN training script with Nebula saving enabled

This example shows how to use Nebula to save checkpoints for a PyTorch CNN training script. In this tutorial, you will run an MNIST training script in the cloud with Azure Machine Learning. The script is plain PyTorch; no trainer framework is used.

In this tutorial, **you can submit the `job.yml` YAML file in this folder to get started with PyTorch and Nebula,** and you will learn how to:

- Initialize Nebula in an existing training script;
- Save checkpoints with Nebula service;

## Prerequisites

Because this tutorial runs in Azure Machine Learning and the training script does not use a trainer, you will need:

- a workspace, compute instance, and compute cluster to use. If you don't have one, use the steps in the [Quickstart: Create workspace resources article](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources) to create one.
- the Azure Machine Learning CLI (v2) installed.
- the ACPT image in the environment. See [Azure Container for PyTorch - Azure Machine Learning | Microsoft Learn](https://learn.microsoft.com/en-us/azure/machine-learning/resource-azure-container-for-pytorch) for more details about the ACPT image.

## Original Training Script

In this tutorial, we use an [example script](https://github.com/pytorch/examples/blob/main/mnist/main.py) from PyTorch, named `train.py`, that trains a simple CNN model on MNIST.

This script downloads the MNIST dataset by using the PyTorch `torchvision.datasets` APIs, sets up the CNN network defined in `Net()`, and trains it for 14 epochs using the negative log-likelihood loss and the Adadelta optimizer.
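For reference, the model from that upstream script can be condensed as below. This is a sketch, not the verbatim `train.py`; the layer shapes follow the pytorch/examples MNIST script, and any detail beyond what this README states should be treated as an assumption:

``` python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """CNN from the pytorch/examples MNIST script (condensed sketch)."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))   # 28x28 -> 26x26
        x = F.relu(self.conv2(x))   # 26x26 -> 24x24
        x = F.max_pool2d(x, 2)      # 24x24 -> 12x12
        x = self.dropout1(x)
        x = torch.flatten(x, 1)     # 64 * 12 * 12 = 9216 features
        x = self.dropout2(F.relu(self.fc1(x)))
        return F.log_softmax(self.fc2(x), dim=1)
```

Training then pairs this network with `optim.Adadelta` and `F.nll_loss` for the 14 epochs described above.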

## Using ACPT Environment for Azure Machine Learning

To use Nebula with the training script, you need to use Azure Container for PyTorch (ACPT) image in the environment. The dependencies of Nebula are already included in the ACPT image.

Azure Container for PyTorch is a lightweight, standalone environment that includes the components needed to run optimized training for large models on Azure Machine Learning. Visit [Azure Container for PyTorch - Azure Machine Learning | Microsoft Learn](https://learn.microsoft.com/en-us/azure/machine-learning/resource-azure-container-for-pytorch) to learn how to use the ACPT image, and [here](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-azure-container-for-pytorch-environment) to learn how to create custom curated ACPT environments.

## Initializing Nebula in the original training script

To enable Nebula for fast checkpointing, you only need to modify a few lines of code. Since this training script does not use a trainer, Nebula must be initialized manually.

First, you need to import the required package `nebulaml` as:

``` python
import nebulaml as nm
```

Then, call the `nm.init()` function in `main()` to initialize Nebula, for example:

``` python
nm.init(persistent_storage_path="/tmp/tier3/test3",
persistent_time_interval=2)
```

## Save Checkpoints with Nebula ⬇️

After initialization, you can save your checkpoint with Nebula by replacing the original `torch.save()` call with:

``` python
checkpoint = nm.Checkpoint()
checkpoint.save('<CKPT_NAME>', model)  # replace <CKPT_NAME> with your checkpoint name
```
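Putting the snippets above together, the pattern can be sketched as follows. The `nebulaml` calls are the ones shown in this README; the `torch.save` fallback is an addition of this sketch so the script also runs outside an ACPT environment, where `nebulaml` is unavailable:

``` python
import torch

try:
    # nebulaml ships in the ACPT image; import fails elsewhere
    import nebulaml as nm
    nm.init(persistent_storage_path="/tmp/tier3/test3",
            persistent_time_interval=2)
    HAVE_NEBULA = True
except Exception:
    HAVE_NEBULA = False

def save_checkpoint(model: torch.nn.Module, name: str) -> None:
    """Save via Nebula when available, else fall back to torch.save."""
    if HAVE_NEBULA:
        checkpoint = nm.Checkpoint()
        checkpoint.save(name, model)
    else:
        torch.save(model.state_dict(), name)
```

Inside the training loop you would then call `save_checkpoint(model, ...)` at the end of each epoch instead of `torch.save(...)`.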

## Submit Your Code

To submit your code to Azure Machine Learning, run the YAML file `job.yml` in this folder. This YAML file defines the environment, the compute target, and the training script.

To run the training script on your own compute resources, change `compute` to the name of your compute cluster and `environment` to your ACPT environment.
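With the Azure ML CLI (v2) installed and signed in, submitting the job file in this folder is a single command; the resource group and workspace names below are placeholders for your own values:

``` shell
# Submit the job defined in job.yml (Azure ML CLI v2 `ml` extension)
az ml job create --file job.yml \
    --resource-group <your-resource-group> \
    --workspace-name <your-workspace>
```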

## View your checkpointing histories

When your job has completed, navigate to the *Job Name > Outputs + logs* page, expand the folder named *nebula* in the left panel, and open *checkpointHistories.csv*. There you can review the details of each checkpoint save with Nebula, such as duration, throughput, and checkpoint size.
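If you download the job outputs, the same file can be inspected programmatically. The helper below makes no assumption about exact column names, since this README only lists the recorded metrics informally:

``` python
import csv

def load_checkpoint_history(path: str) -> list[dict]:
    """Read nebula/checkpointHistories.csv into a list of row dicts.

    Column names vary by Nebula version, so rows keep whatever
    columns the CSV actually contains.
    """
    with open(path, newline="") as f:
        return list(csv.DictReader(f))
```

For example, `load_checkpoint_history("outputs/nebula/checkpointHistories.csv")` returns one dict per checkpoint save.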

## Next Step

Try out another example to get a general idea of how to enable Nebula
with your training script.

- [DeepSpeed Training with CIFAR 10 dataset](./cifar10_deepspeed/README.md)
37 changes: 37 additions & 0 deletions cli/jobs/nebulaml/PyTorch_CNN_MNIST/job.yml
@@ -0,0 +1,37 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json

# name of the experiment, use this to structure your training results
experiment_name: nebula-mnist-example
display_name: nebula-mnist-example

code: .

# compute cluster on which the command above should be run
# note: before you can use the compute, it needs to be created first in your AzureML workspace
compute: gpu-cluster

# compute resources
resources:
# number of nodes in the cluster above
instance_count: 1

# environment in which the command above should be run
# note: it is recommended to use an environment that either is the ACPT image or at least inherits from it. The ACPT
# image has many PyTorch-related frameworks pre-installed, which reduces the effort needed to create your own
# environment. To learn more about environments in AML, see
# https://learn.microsoft.com/en-us/azure/machine-learning/concept-environments
#
# for the latest ACPT image using PyTorch 1.12, CUDA 11.6 on Python 3.9
# environment: azureml:acpt-pytorch-1.12-py39-cuda11.6@latest
environment: azureml:AzureML-acpt-pytorch-2.0-cuda11.7:3
# for a specific version
#environment: azureml:AzureML-ACPT-pytorch-1.12-py39-cuda11.6-gpu:3
# for the latest version of a custom environment: (====install Nebula in dockerfile environment=====)
#environment: azureml:ACPT-Extended@latest
# for a specific Docker image (has to be compatible with Azure ML):
#environment:
# image: mcr.microsoft.com/azureml/curated/acpt-pytorch-1.12-py39-cuda11.6-gpu:2

# command that should be run by the job (====install Nebula with command OR use dockerfile environment=====)
command: >-
python train.py --save-model