# CI #876
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Do not edit this file! It has been generated by .github/gen-workflow-ci.py
name: CI

on:
  schedule:
    # builds master; a scheduled run neither publishes test results nor
    # cancels concurrent builds
    - cron: '0 10 * * *'  # every day at 10am
  push:
    # pushes are only considered for master, hotfix branches and tags,
    # anything else requires modifying job.config.outputs.push
    branches:
      - 'master'
      - 'hotfix-*'
    tags:
      - 'v*.*.*'
  pull_request:
    # pull requests are only considered when they target master
    branches:
      - master
  workflow_dispatch:

permissions: {}

concurrency:
  # The group key decides which concurrent builds get cancelled:
  # - concurrent builds on a branch (pull_request) are not wanted
  # - concurrent builds of the same commit on master (push) are not wanted
  # - concurrent builds of the same commit on a tag (push) are not wanted
  # - runs for the same commit on master and on its tag (push) may coexist
  # - a push run on master and a scheduled run (schedule) for the same commit
  #   may coexist
  #
  # A pull_request event only runs on a branch commit, a push event only on a
  # master or tag commit, a schedule event only on the HEAD commit of master.
  #
  # github.ref is something like refs/heads/master, refs/tags/v0.22.1 or the
  # branch; including it keeps runs on master and on a tag that share the same
  # commit from cancelling each other.
  # github.head_ref is the branch of the pull request. It is empty on master,
  # where the commit SHA is used instead: individual commits to master are
  # never cancelled, while a branch has at most one concurrent build.
  #
  # The event name is part of the key so that scheduled builds and push builds
  # on master can run concurrently.
  group: ci-${{ github.event_name }}-${{ github.ref }}-${{ github.head_ref || github.sha }}
  cancel-in-progress: true

jobs:
# Uploads the file at github.event_path (the payload of the triggering event)
# as an artifact named "Event File".
# NOTE(review): presumably consumed by a follow-up workflow - confirm.
event_file:
  name: Event File
  runs-on: ubuntu-latest
  steps:
    - name: Upload
      uses: actions/upload-artifact@v4
      with:
        path: ${{ github.event_path }}
        name: Event File
# Packaging smoke test: builds a source distribution with setup.py and
# installs the resulting tarball with pip.
# NOTE: this workflow is generated by .github/gen-workflow-ci.py; mirror any
# change made here in the generator.
setup-py:
  name: "setup.py"
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        # quoted: an unquoted version is a YAML float (3.10 would become 3.1)
        python-version: '3.8'
    - name: Test setup.py
      env:
        # explicit strings; environment variables are always strings.
        # NOTE(review): these presumably switch off the optional framework
        # and controller integrations during install - confirm in setup.py.
        HOROVOD_WITHOUT_TENSORFLOW: '1'
        HOROVOD_WITHOUT_PYTORCH: '1'
        HOROVOD_WITHOUT_MXNET: '1'
        HOROVOD_WITHOUT_GLOO: '1'
        HOROVOD_WITHOUT_MPI: '1'
      run: |
        python -m pip install --upgrade pip
        python -m pip install setuptools wheel
        python setup.py sdist
        pip -v install dist/horovod-*.tar.gz
# Decides whether the build-and-test job has to run, derives the Buildkite
# branch label / message outputs and, for pull requests, uploads a small
# "PR Meta" artifact with the merge, base and head SHAs.
# NOTE: this workflow is generated by .github/gen-workflow-ci.py; mirror any
# change made here in the generator, otherwise the up-to-date check below fails.
init-workflow:
  name: "Init Workflow"
  runs-on: ubuntu-latest
  outputs:
    # 'false' only for schedule events outside of horovod/horovod
    run-at-all: ${{ github.event_name != 'schedule' || github.repository == 'horovod/horovod' }}
    # if we don't get a clear 'false', we fall back to building and testing
    run-builds-and-tests: ${{ steps.tests.outputs.needed != 'false' }}
    buildkite-branch-label: "${{ steps.config-buildkite.outputs.branch-label }}"
    buildkite-message: "${{ steps.config-buildkite.outputs.message }}"
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        # quoted: an unquoted version is a YAML float (3.10 would become 3.1)
        python-version: '3.8'
    - name: Pip install dependencies
      run: pip install -r .github/requirements.txt
    - name: Check ci.yaml is up-to-date
      # runs the generator and fails when the committed workflow file then
      # shows a diff
      run: |
        python .github/gen-workflow-ci.py
        if [[ $(git diff .github/workflows/ci.yaml | wc -l) -gt 0 ]]
        then
          echo "::error::Workflow file .github/workflows/ci.yaml is out-dated, please run .github/gen-workflow-ci.py and commit changes"
          exit 1
        fi
      shell: bash
    - name: Check if tests are needed
      id: tests
      env:
        # NOTE(review): presumably read by get-changed-code-files.py - confirm
        GITHUB_BASE_SHA: ${{ github.event.pull_request.base.sha }}
        GITHUB_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
      run: |
        if [[ "${{ github.event_name }}" == "pull_request" ]]
        then
          changes="$(python .github/get-changed-code-files.py)"
          if [[ -z "$changes" ]]
          then
            echo "No code changes, no need to build and test"
            echo "needed=false" >> "$GITHUB_OUTPUT"
          else
            echo "Code changes, we need to build and test:"
            echo "$changes"
            echo "needed=true" >> "$GITHUB_OUTPUT"
          fi
        else
          echo "This is not part of a pull request, we need to build and test"
          echo "needed=true" >> "$GITHUB_OUTPUT"
        fi
    - name: Configure Buildkite Build
      id: config-buildkite
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        # The pull request branch name is chosen by the PR author. It is
        # handed to the script through the environment instead of being
        # expanded into the script text, so it cannot inject shell commands.
        BRANCH_REF: ${{ github.event.pull_request.head.ref || github.ref }}
      run: |
        branch="${BRANCH_REF}"
        branch="${branch#"refs/heads/"}"
        branch="${branch#"refs/tags/"}"
        branch_label="${branch}"
        if [[ "${{ github.event_name }}" == "schedule" ]]
        then
          # we add this label to the branch used by Buildkite to avoid it cancelling one of concurrent schedule and push builds on master
          branch_label="${branch} (schedule)"
        fi
        echo "branch-label=${branch_label}" >> "$GITHUB_OUTPUT"
        if [[ "${{ github.event_name }}" == "pull_request" ]]
        then
          head_sha="${{ github.event.pull_request.head.sha }}"
          # only the first line of the head commit's message is used
          message="$(gh api https://api.github.com/repos/horovod/horovod/commits/${head_sha} -q .commit.message | head -n1)"
          echo "message=${message}" >> "$GITHUB_OUTPUT"
        fi
    - name: Provide PR meta
      if: github.event_name == 'pull_request'
      # writes pr.json as a single-line JSON object
      run: |
        rm -f pr.json
        echo -n "{" >> pr.json
        echo -n " \"merge_sha\": \"${{ github.sha }}\"," >> pr.json
        echo -n " \"base_sha\": \"${{ github.event.pull_request.base.sha }}\"," >> pr.json
        echo -n " \"head_sha\": \"${{ github.event.pull_request.head.sha }}\" " >> pr.json
        echo -n "}" >> pr.json
        cat pr.json
    - name: Upload PR meta
      uses: actions/upload-artifact@v4
      if: github.event_name == 'pull_request'
      with:
        name: PR Meta
        path: pr.json
# One run per matrix image, named after that image.
build-and-test:
  name: "Build and Test (${{ matrix.image }})"
  runs-on: ubuntu-latest
  needs:
    - init-workflow
  # runs only when init-workflow reported 'true' for both of these outputs
  if: >
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true'
# Matrix of test images. Each entry names an image, sets build_timeout and
# enables tests through boolean flags. The test steps of this job check these
# flags in their `if:` conditions (matrix.<flag>); a flag that is absent from
# an entry leaves the corresponding step disabled for that image.
strategy:
  max-parallel: 10
  # a failing image does not cancel the runs of the other images
  fail-fast: false
  matrix:
    include:
      - image: test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_8_1-mxnet1_5_1_p0-pyspark3_3_1
        Elastic_Spark_TensorFlow_Tests_2: true
        Elastic_Tests_2: true
        Gloo_Cluster_PyTests: true
        Gloo_Keras_MNIST: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_MNIST: true
        Single_Keras_MNIST: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Keras_MNIST: true
        Spark_Keras_Rossmann_Estimator: true
        Spark_Keras_Rossmann_Run: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_7-tf2_11_0-keras2_11_0-torch1_13_0-mxnet1_9_1-pyspark2_4_8
        Elastic_Spark_TensorFlow_Tests_1: true
        Elastic_Spark_Torch_Tests: true
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_8-tf2_10_1-keras2_10_0-torch1_12_1-mxnet1_8_0_p0-pyspark3_3_1
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Data_Service: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_8-tf2_11_0-keras2_11_0-torch1_13_0-mxnet1_9_1-pyspark3_2_3
        Elastic_Spark_TensorFlow_Tests_1: true
        Elastic_Spark_Torch_Tests: true
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_8-tf2_11_0-keras2_11_0-torch1_13_0-mxnet1_9_1-pyspark3_3_1
        Elastic_Spark_TensorFlow_Tests_1: true
        Elastic_Spark_Torch_Tests: true
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-gloo-py3_8-tf2_9_3-keras2_9_0-torch1_11_0-mxnet1_7_0_p2-pyspark3_3_1
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Data_Service: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-mpich-py3_8-tf2_11_0-keras2_11_0-torch1_13_0-mxnet1_9_1-pyspark3_3_1
        MPI_Cluster_PyTests: true
        MPI_MXNet_MNIST_horovodrun: true
        MPI_Parallel_PyTests: true
        MPI_PyTorch_MNIST_api: true
        MPI_PyTorch_MNIST_horovodrun: true
        MPI_Single_PyTests: true
        MPI_TensorFlow_2_0_Keras_MNIST_api: true
        MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        MPI_TensorFlow_2_0_MNIST_api: true
        MPI_TensorFlow_2_0_MNIST_horovodrun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        build_timeout: 30

      - image: test-cpu-openmpi-gloo-py3_8-tf2_11_0-keras2_11_0-torch1_13_0-mxnet1_9_1-pyspark3_3_1
        Elastic_Tests_1: true
        Gloo_Cluster_PyTests: true
        Gloo_MXNet_MNIST_horovodrun: true
        Gloo_Parallel_PyTests: true
        Gloo_PyTorch_MNIST_api: true
        Gloo_PyTorch_MNIST_horovodrun: true
        Gloo_Single_PyTests: true
        Gloo_TensorFlow_2_0_Keras_MNIST_api: true
        Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
        Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
        Gloo_TensorFlow_2_0_MNIST_api: true
        Gloo_TensorFlow_2_0_MNIST_horovodrun: true
        MPI_Cluster_PyTests: true
        MPI_MXNet_MNIST_horovodrun: true
        MPI_Parallel_PyTests: true
        MPI_PyTorch_MNIST_api: true
        MPI_PyTorch_MNIST_horovodrun: true
        MPI_Single_PyTests: true
        MPI_TensorFlow_2_0_Keras_MNIST_api: true
        MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        MPI_TensorFlow_2_0_MNIST_api: true
        MPI_TensorFlow_2_0_MNIST_horovodrun: true
        Run_PyTests_test_interactiverun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      - image: test-cpu-openmpi-py3_8-tf2_11_0-keras2_11_0-torch1_13_0-mxnet1_9_1-pyspark3_3_1
        MPI_Cluster_PyTests: true
        MPI_MXNet_MNIST_horovodrun: true
        MPI_Parallel_PyTests: true
        MPI_PyTorch_MNIST_api: true
        MPI_PyTorch_MNIST_horovodrun: true
        MPI_Single_PyTests: true
        MPI_TensorFlow_2_0_Keras_MNIST_api: true
        MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
        MPI_TensorFlow_2_0_MNIST_api: true
        MPI_TensorFlow_2_0_MNIST_horovodrun: true
        Run_PyTests_test_interactiverun: true
        Single_MXNet_MNIST: true
        Single_PyTorch_MNIST: true
        Spark_Lightning_MNIST: true
        Spark_PyTests: true
        Spark_TensorFlow_2_0_MNIST_Data_Service: true
        Spark_Torch_MNIST: true
        build_timeout: 30

      # The remaining entries set no test flags at all, only a (longer)
      # build timeout.
      - image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_8_1-mxnet1_5_1_p0-pyspark3_3_1
        build_timeout: 40

      - image: test-gpu-gloo-py3_8-tf2_10_1-keras2_10_0-torch1_12_1-mxnet1_8_0_p0-pyspark3_3_1
        build_timeout: 40

      - image: test-gpu-gloo-py3_8-tf2_9_3-keras2_9_0-torch1_11_0-mxnet1_7_0_p1-pyspark3_3_1
        build_timeout: 40

      - image: test-gpu-openmpi-gloo-py3_8-tf2_11_0-keras2_11_0-torch1_13_0-mxnet1_9_1-pyspark3_3_1
        build_timeout: 40

      - image: test-mixed-openmpi-gloo-py3_8-tf2_11_0-keras2_11_0-torch1_13_0-mxnet1_9_1-pyspark3_3_1
        build_timeout: 40
steps: | |
# Frees disk space on the runner and reports `df -h` before and afterwards.
- name: Clean up disk space
  # deleting these paths frees 38 GB disk space:
  #   sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
  # but this sometimes takes 3-4 minutes,
  # so only some sub-paths are deleted, which are known to be quick (10s)
  # and free 20 GB
  run: |
    echo ::group::Disk space before clean up
    df -h
    echo ::endgroup::
    # The glob is kept literal in this list; it is expanded only where the
    # unquoted $target is handed to du and rm below.
    targets=(
      '/usr/share/dotnet/sdk/*/nuGetPackagesArchive.lzma'
      /usr/share/dotnet/shared
      /usr/local/lib/android/sdk/ndk
      /usr/local/lib/android/sdk/build-tools
      /opt/ghc
    )
    for target in "${targets[@]}"
    do
      echo ::group::Deleting "$target"
      sudo du -hsc $target | tail -n1 || true
      sudo rm -rf $target
      echo ::endgroup::
    done
    echo ::group::Disk space after clean up
    df -h
    echo ::endgroup::
# Prepares the runner for the image build: sources including all submodules,
# a Python interpreter, and docker-compose installed through pip.
# NOTE: this workflow is generated by .github/gen-workflow-ci.py; mirror any
# change made here in the generator.
- name: Checkout
  uses: actions/checkout@v4
  with:
    submodules: recursive
- name: Setup Python
  uses: actions/setup-python@v5
  with:
    # quoted: an unquoted version is a YAML float (3.10 would become 3.1)
    python-version: '3.8'
- name: Setup docker-compose
  run: pip install docker-compose
# Builds the docker-compose service named after the matrix image. The test
# steps of this job are gated on this step's outcome (steps.build.outcome).
# NOTE(review): the arguments "<build_timeout>m 3 10" are presumably timeout,
# number of attempts and pause between attempts - confirm against
# .github/timeout-and-retry.sh.
- id: build
  name: Build
  env:
    COMPOSE_DOCKER_CLI_BUILD: 1
    DOCKER_BUILDKIT: 1
  run: |
    .github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 \
      docker-compose -f docker-compose.test.yml build ${{ matrix.image }}
# Elastic Spark TensorFlow Tests 1 (test_elastic_spark_tensorflow2.py), up to
# three attempts. Attempt 1 runs when the image build succeeded and the matrix
# entry enables this test; attempts 2 and 3 run only if the preceding attempt
# failed. Only a failure of attempt 3 is not tolerated (continue-on-error:
# false). Each attempt mounts its own artifacts directory at /artifacts.
- id: Elastic_Spark_TensorFlow_Tests_1_run_1
  name: "Elastic Spark TensorFlow Tests 1 [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
- id: Elastic_Spark_TensorFlow_Tests_1_run_2
  name: "Elastic Spark TensorFlow Tests 1 [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
- id: Elastic_Spark_TensorFlow_Tests_1_run_3
  name: "Elastic Spark TensorFlow Tests 1 [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
# Elastic Spark TensorFlow Tests 2 (test_elastic_spark_tensorflow.py), up to
# three attempts. Attempt 1 runs when the image build succeeded and the matrix
# entry enables this test; attempts 2 and 3 run only if the preceding attempt
# failed. Only a failure of attempt 3 is not tolerated (continue-on-error:
# false). Each attempt mounts its own artifacts directory at /artifacts.
- id: Elastic_Spark_TensorFlow_Tests_2_run_1
  name: "Elastic Spark TensorFlow Tests 2 [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
- id: Elastic_Spark_TensorFlow_Tests_2_run_2
  name: "Elastic Spark TensorFlow Tests 2 [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
- id: Elastic_Spark_TensorFlow_Tests_2_run_3
  name: "Elastic Spark TensorFlow Tests 2 [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
# Elastic Spark Torch Tests (test_elastic_spark_torch.py), up to three
# attempts. Attempt 1 runs when the image build succeeded and the matrix entry
# enables this test; attempts 2 and 3 run only if the preceding attempt
# failed. Only a failure of attempt 3 is not tolerated (continue-on-error:
# false). Each attempt mounts its own artifacts directory at /artifacts.
- id: Elastic_Spark_Torch_Tests_run_1
  name: "Elastic Spark Torch Tests [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
- id: Elastic_Spark_Torch_Tests_run_2
  name: "Elastic Spark Torch Tests [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
- id: Elastic_Spark_Torch_Tests_run_3
  name: "Elastic Spark Torch Tests [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
# Elastic Tests 1 (test_elastic_torch.py, test_elastic_tensorflow2.py), up to
# three attempts. Attempt 1 runs when the image build succeeded and the matrix
# entry enables this test; attempts 2 and 3 run only if the preceding attempt
# failed. Only a failure of attempt 3 is not tolerated (continue-on-error:
# false). Each attempt mounts its own artifacts directory at /artifacts.
- id: Elastic_Tests_1_run_1
  name: "Elastic Tests 1 [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
- id: Elastic_Tests_1_run_2
  name: "Elastic Tests 1 [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
- id: Elastic_Tests_1_run_3
  name: "Elastic Tests 1 [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
# Elastic Tests 2 (test_elastic_torch.py, test_elastic_tensorflow.py,
# test_elastic_tensorflow_keras.py), up to three attempts. Attempt 1 runs when
# the image build succeeded and the matrix entry enables this test; attempts 2
# and 3 run only if the preceding attempt failed. Only a failure of attempt 3
# is not tolerated (continue-on-error: false). Each attempt mounts its own
# artifacts directory at /artifacts.
- id: Elastic_Tests_2_run_1
  name: "Elastic Tests 2 [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
- id: Elastic_Tests_2_run_2
  name: "Elastic Tests 2 [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
- id: Elastic_Tests_2_run_3
  name: "Elastic Tests 2 [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
# Gloo Cluster PyTests (test_static_run.py, after starting ssh inside the
# container), up to three attempts. Attempt 1 runs when the image build
# succeeded and the matrix entry enables this test; attempts 2 and 3 run only
# if the preceding attempt failed. Only a failure of attempt 3 is not
# tolerated (continue-on-error: false). Each attempt mounts its own artifacts
# directory at /artifacts.
- id: Gloo_Cluster_PyTests_run_1
  name: "Gloo Cluster PyTests [attempt 1 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && true
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
- id: Gloo_Cluster_PyTests_run_2
  name: "Gloo Cluster PyTests [attempt 2 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
- id: Gloo_Cluster_PyTests_run_3
  name: "Gloo Cluster PyTests [attempt 3 of 3]"
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
# Gloo Keras MNIST, attempt 1 of 3: runs keras_mnist_advanced.py under
# horovodrun (2 processes on localhost, Gloo), capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo Keras MNIST [attempt 1 of 3]"
  id: Gloo_Keras_MNIST_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_Keras_MNIST && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
  shell: bash
# Gloo Keras MNIST, attempt 2 of 3: same command as attempt 1, run only when
# attempt 1 failed. continue-on-error lets a failure be retried by attempt 3.
- name: "Gloo Keras MNIST [attempt 2 of 3]"
  id: Gloo_Keras_MNIST_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_Keras_MNIST &&
    steps.Gloo_Keras_MNIST_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
  shell: bash
# Gloo Keras MNIST, attempt 3 of 3: last retry, run only when attempt 2
# failed. continue-on-error is false, so a failure here is not tolerated.
- name: "Gloo Keras MNIST [attempt 3 of 3]"
  id: Gloo_Keras_MNIST_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_Keras_MNIST &&
    steps.Gloo_Keras_MNIST_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
  shell: bash
# Gloo MXNet2 MNIST api, attempt 1 of 3: runs mxnet2_mnist.py directly with
# --num-proc 2 --hosts localhost:2 --communication gloo, capped at 10 minutes.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo MXNet2 MNIST api [attempt 1 of 3]"
  id: Gloo_MXNet2_MNIST_api_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet2_MNIST_api && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo MXNet2 MNIST api, attempt 2 of 3: same command as attempt 1, run only
# when attempt 1 failed. continue-on-error lets a failure be retried by
# attempt 3.
- name: "Gloo MXNet2 MNIST api [attempt 2 of 3]"
  id: Gloo_MXNet2_MNIST_api_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet2_MNIST_api &&
    steps.Gloo_MXNet2_MNIST_api_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo MXNet2 MNIST api, attempt 3 of 3: last retry, run only when attempt 2
# failed. continue-on-error is false, so a failure here is not tolerated.
- name: "Gloo MXNet2 MNIST api [attempt 3 of 3]"
  id: Gloo_MXNet2_MNIST_api_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet2_MNIST_api &&
    steps.Gloo_MXNet2_MNIST_api_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo MXNet2 MNIST horovodrun, attempt 1 of 3: runs mxnet2_mnist.py under
# horovodrun (2 processes on localhost, Gloo), capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_MXNet2_MNIST_horovodrun_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet2_MNIST_horovodrun && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
  shell: bash
# Gloo MXNet2 MNIST horovodrun, attempt 2 of 3: same command as attempt 1,
# run only when attempt 1 failed. continue-on-error lets a failure be retried
# by attempt 3.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_MXNet2_MNIST_horovodrun_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet2_MNIST_horovodrun &&
    steps.Gloo_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
  shell: bash
# Gloo MXNet2 MNIST horovodrun, attempt 3 of 3: last retry, run only when
# attempt 2 failed. continue-on-error is false, so a failure here is not
# tolerated.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_MXNet2_MNIST_horovodrun_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet2_MNIST_horovodrun &&
    steps.Gloo_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
  shell: bash
# Gloo MXNet MNIST horovodrun, attempt 1 of 3: runs mxnet_mnist.py under
# horovodrun (2 processes on localhost, Gloo), capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo MXNet MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_MXNet_MNIST_horovodrun_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet_MNIST_horovodrun && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
  shell: bash
# Gloo MXNet MNIST horovodrun, attempt 2 of 3: same command as attempt 1, run
# only when attempt 1 failed. continue-on-error lets a failure be retried by
# attempt 3.
- name: "Gloo MXNet MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_MXNet_MNIST_horovodrun_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet_MNIST_horovodrun &&
    steps.Gloo_MXNet_MNIST_horovodrun_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
  shell: bash
# Gloo MXNet MNIST horovodrun, attempt 3 of 3: last retry, run only when
# attempt 2 failed. continue-on-error is false, so a failure here is not
# tolerated.
- name: "Gloo MXNet MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_MXNet_MNIST_horovodrun_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_MXNet_MNIST_horovodrun &&
    steps.Gloo_MXNet_MNIST_horovodrun_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
  shell: bash
# Gloo Parallel PyTests, attempt 1 of 3: in /horovod/test/parallel, feeds each
# test_*.py file one at a time (xargs -n 1) to /pytest.sh under horovodrun
# (2 processes on localhost, Gloo); the whole run is capped at 15 minutes.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo Parallel PyTests [attempt 1 of 3]"
  id: Gloo_Parallel_PyTests_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_Parallel_PyTests && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
  shell: bash
# Gloo Parallel PyTests, attempt 2 of 3: same command as attempt 1, run only
# when attempt 1 failed. continue-on-error lets a failure be retried by
# attempt 3.
- name: "Gloo Parallel PyTests [attempt 2 of 3]"
  id: Gloo_Parallel_PyTests_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_Parallel_PyTests &&
    steps.Gloo_Parallel_PyTests_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
  shell: bash
# Gloo Parallel PyTests, attempt 3 of 3: last retry, run only when attempt 2
# failed. continue-on-error is false, so a failure here is not tolerated.
- name: "Gloo Parallel PyTests [attempt 3 of 3]"
  id: Gloo_Parallel_PyTests_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_Parallel_PyTests &&
    steps.Gloo_Parallel_PyTests_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
  shell: bash
# Gloo PyTorch MNIST api, attempt 1 of 3: runs pytorch_mnist.py directly with
# --num-proc 2 --hosts localhost:2 --communication gloo and the dataset dir
# /data/pytorch_datasets, capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo PyTorch MNIST api [attempt 1 of 3]"
  id: Gloo_PyTorch_MNIST_api_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_PyTorch_MNIST_api && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo PyTorch MNIST api, attempt 2 of 3: same command as attempt 1, run only
# when attempt 1 failed. continue-on-error lets a failure be retried by
# attempt 3.
- name: "Gloo PyTorch MNIST api [attempt 2 of 3]"
  id: Gloo_PyTorch_MNIST_api_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_PyTorch_MNIST_api &&
    steps.Gloo_PyTorch_MNIST_api_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo PyTorch MNIST api, attempt 3 of 3: last retry, run only when attempt 2
# failed. continue-on-error is false, so a failure here is not tolerated.
- name: "Gloo PyTorch MNIST api [attempt 3 of 3]"
  id: Gloo_PyTorch_MNIST_api_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_PyTorch_MNIST_api &&
    steps.Gloo_PyTorch_MNIST_api_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo PyTorch MNIST horovodrun, attempt 1 of 3: runs pytorch_mnist.py under
# horovodrun (2 processes on localhost, Gloo) with the dataset dir
# /data/pytorch_datasets, capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo PyTorch MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_PyTorch_MNIST_horovodrun_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_PyTorch_MNIST_horovodrun && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
  shell: bash
# Gloo PyTorch MNIST horovodrun, attempt 2 of 3: same command as attempt 1,
# run only when attempt 1 failed. continue-on-error lets a failure be retried
# by attempt 3.
- name: "Gloo PyTorch MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_PyTorch_MNIST_horovodrun_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_PyTorch_MNIST_horovodrun &&
    steps.Gloo_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
  shell: bash
# Gloo PyTorch MNIST horovodrun, attempt 3 of 3: last retry, run only when
# attempt 2 failed. continue-on-error is false, so a failure here is not
# tolerated.
- name: "Gloo PyTorch MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_PyTorch_MNIST_horovodrun_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_PyTorch_MNIST_horovodrun &&
    steps.Gloo_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
  shell: bash
# Gloo Single PyTests, attempt 1 of 3: in /horovod/test/single, feeds each
# test_*.py file one at a time (xargs -n 1) to /pytest_standalone.sh with the
# argument `gloo`; the whole run is capped at 15 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo Single PyTests [attempt 1 of 3]"
  id: Gloo_Single_PyTests_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_Single_PyTests && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
  shell: bash
# Gloo Single PyTests, attempt 2 of 3: same command as attempt 1, run only
# when attempt 1 failed. continue-on-error lets a failure be retried by
# attempt 3.
- name: "Gloo Single PyTests [attempt 2 of 3]"
  id: Gloo_Single_PyTests_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_Single_PyTests &&
    steps.Gloo_Single_PyTests_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
  shell: bash
# Gloo Single PyTests, attempt 3 of 3: last retry, run only when attempt 2
# failed. continue-on-error is false, so a failure here is not tolerated.
- name: "Gloo Single PyTests [attempt 3 of 3]"
  id: Gloo_Single_PyTests_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_Single_PyTests &&
    steps.Gloo_Single_PyTests_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST api, attempt 1 of 3: runs
# tensorflow2_keras_mnist.py directly with the positional arguments
# `2 localhost:2 gloo`, capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST api, attempt 2 of 3: same command as
# attempt 1, run only when attempt 1 failed. continue-on-error lets a failure
# be retried by attempt 3.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api &&
    steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST api, attempt 3 of 3: last retry, run only
# when attempt 2 failed. continue-on-error is false, so a failure here is not
# tolerated.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api &&
    steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST horovodrun, attempt 1 of 3: runs
# tensorflow2_keras_mnist.py under horovodrun (2 processes on localhost,
# Gloo), capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST horovodrun, attempt 2 of 3: same command as
# attempt 1, run only when attempt 1 failed. continue-on-error lets a failure
# be retried by attempt 3.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun &&
    steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST horovodrun, attempt 3 of 3: last retry, run
# only when attempt 2 failed. continue-on-error is false, so a failure here is
# not tolerated.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun &&
    steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST Data Service, attempt 1 of 3: inside one bash -c,
# starts horovod.tensorflow.data.compute_worker in the background (`&`) and
# then runs tensorflow2_mnist_data_service.py in the foreground; both are given
# /tmp/compute.json and the whole run is capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
  shell: bash
# Gloo TensorFlow 2.0 MNIST Data Service, attempt 2 of 3: same command as
# attempt 1, run only when attempt 1 failed. continue-on-error lets a failure
# be retried by attempt 3.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service &&
    steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
  shell: bash
# Gloo TensorFlow 2.0 MNIST Data Service, attempt 3 of 3: last retry, run only
# when attempt 2 failed. continue-on-error is false, so a failure here is not
# tolerated.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service &&
    steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic api, attempt 1 of 3: runs
# tensorflow2_mnist_elastic.py directly with the positional arguments
# `2 2 2 localhost:2,127.0.0.1:2`, capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic api, attempt 2 of 3: same command as
# attempt 1, run only when attempt 1 failed. continue-on-error lets a failure
# be retried by attempt 3.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api &&
    steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic api, attempt 3 of 3: last retry, run only
# when attempt 2 failed. continue-on-error is false, so a failure here is not
# tolerated.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api &&
    steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun, attempt 1 of 3: runs
# tensorflow2_mnist_elastic.py under horovodrun with -np/--min-np/--max-np all
# set to 2, hosts localhost:2,127.0.0.1:2 and Gloo, capped at 10 minutes.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun, attempt 2 of 3: same command
# as attempt 1, run only when attempt 1 failed. continue-on-error lets a
# failure be retried by attempt 3.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun &&
    steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun, attempt 3 of 3: last retry,
# run only when attempt 2 failed. continue-on-error is false, so a failure
# here is not tolerated.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun &&
    steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST api, attempt 1 of 3: runs tensorflow2_mnist.py
# directly with the positional arguments `2 localhost:2 gloo`, capped at
# 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_api_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_api && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 MNIST api, attempt 2 of 3: same command as attempt 1,
# run only when attempt 1 failed. continue-on-error lets a failure be retried
# by attempt 3.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_api_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_api &&
    steps.Gloo_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 MNIST api, attempt 3 of 3: last retry, run only when
# attempt 2 failed. continue-on-error is false, so a failure here is not
# tolerated.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_api_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_api &&
    steps.Gloo_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 MNIST horovodrun, attempt 1 of 3: runs
# tensorflow2_mnist.py under horovodrun (2 processes on localhost, Gloo),
# capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST horovodrun, attempt 2 of 3: same command as
# attempt 1, run only when attempt 1 failed. continue-on-error lets a failure
# be retried by attempt 3.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun &&
    steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST horovodrun, attempt 3 of 3: last retry, run only
# when attempt 2 failed. continue-on-error is false, so a failure here is not
# tolerated.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3
  continue-on-error: false
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun &&
    steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure'
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
  shell: bash
# Gloo TensorFlow MNIST, attempt 1 of 3: runs tensorflow_mnist.py under
# horovodrun (2 processes on localhost, Gloo), capped at 10 minutes by timeout.
# Runs when the build step succeeded and the matrix enables this test; the
# trailing `&& true` is a no-op filling the slot where retries check the
# previous attempt. continue-on-error lets a failure be retried by attempt 2.
- name: "Gloo TensorFlow MNIST [attempt 1 of 3]"
  id: Gloo_TensorFlow_MNIST_run_1
  continue-on-error: true
  if: >-
    always() && steps.build.outcome == 'success' &&
    matrix.Gloo_TensorFlow_MNIST && true
  # Per-attempt artifacts directory, mounted into the container at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
  shell: bash
- name: "Gloo TensorFlow MNIST [attempt 2 of 3]" | |
id: Gloo_TensorFlow_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py | |
shell: bash | |
- name: "Gloo TensorFlow MNIST [attempt 3 of 3]" | |
id: Gloo_TensorFlow_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py | |
shell: bash | |
# MPI Cluster PyTests: inside the matrix.image docker-compose service, starts
# ssh via /etc/init.d/ssh, then runs pytest on test_static_run.py from
# /horovod/test/integration, writing the JUnit report to
# /artifacts/junit.mpi.static.xml. The command is capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI Cluster PyTests [attempt 1 of 3]'
  id: MPI_Cluster_PyTests_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Cluster_PyTests
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- name: 'MPI Cluster PyTests [attempt 2 of 3]'
  id: MPI_Cluster_PyTests_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Cluster_PyTests &&
    steps.MPI_Cluster_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- name: 'MPI Cluster PyTests [attempt 3 of 3]'
  id: MPI_Cluster_PyTests_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Cluster_PyTests &&
    steps.MPI_Cluster_PyTests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
# MPI Cluster PyTests [ONECCL MPI]: inside the matrix.image docker-compose
# service, executes the command read from /oneccl_env, writes the text
# '/mpirun_command_mpi' into /mpirun_command, starts ssh, then runs pytest on
# test_static_run.py from /horovod/test/integration with the JUnit report at
# /artifacts/junit.mpi.static.xml. The command is capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI Cluster PyTests [ONECCL MPI] [attempt 1 of 3]'
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Cluster_PyTests_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- name: 'MPI Cluster PyTests [ONECCL MPI] [attempt 2 of 3]'
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Cluster_PyTests_ONECCL_MPI &&
    steps.MPI_Cluster_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- name: 'MPI Cluster PyTests [ONECCL MPI] [attempt 3 of 3]'
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Cluster_PyTests_ONECCL_MPI &&
    steps.MPI_Cluster_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
# MPI Cluster PyTests [ONECCL OFI]: inside the matrix.image docker-compose
# service, executes the command read from /oneccl_env, writes the text
# '/mpirun_command_ofi' into /mpirun_command, starts ssh, then runs pytest on
# test_static_run.py from /horovod/test/integration with the JUnit report at
# /artifacts/junit.mpi.static.xml. The command is capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI Cluster PyTests [ONECCL OFI] [attempt 1 of 3]'
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Cluster_PyTests_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- name: 'MPI Cluster PyTests [ONECCL OFI] [attempt 2 of 3]'
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Cluster_PyTests_ONECCL_OFI &&
    steps.MPI_Cluster_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
- name: 'MPI Cluster PyTests [ONECCL OFI] [attempt 3 of 3]'
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Cluster_PyTests_ONECCL_OFI &&
    steps.MPI_Cluster_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
# MPI MXNet2 MNIST api: runs mxnet2_mnist.py directly with
# `--num-proc 2 --hosts localhost:2 --communication mpi` in the matrix.image
# docker-compose service, capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI MXNet2 MNIST api [attempt 1 of 3]'
  id: MPI_MXNet2_MNIST_api_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet2_MNIST_api
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
- name: 'MPI MXNet2 MNIST api [attempt 2 of 3]'
  id: MPI_MXNet2_MNIST_api_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet2_MNIST_api &&
    steps.MPI_MXNet2_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
- name: 'MPI MXNet2 MNIST api [attempt 3 of 3]'
  id: MPI_MXNet2_MNIST_api_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet2_MNIST_api &&
    steps.MPI_MXNet2_MNIST_api_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
# MPI MXNet2 MNIST horovodrun: inside the matrix.image docker-compose service,
# launches mxnet2_mnist.py with OMP_NUM_THREADS=1 through the launcher command
# read from /mpirun_command, capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI MXNet2 MNIST horovodrun [attempt 1 of 3]'
  id: MPI_MXNet2_MNIST_horovodrun_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet2_MNIST_horovodrun
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
- name: 'MPI MXNet2 MNIST horovodrun [attempt 2 of 3]'
  id: MPI_MXNet2_MNIST_horovodrun_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet2_MNIST_horovodrun &&
    steps.MPI_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
- name: 'MPI MXNet2 MNIST horovodrun [attempt 3 of 3]'
  id: MPI_MXNet2_MNIST_horovodrun_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet2_MNIST_horovodrun &&
    steps.MPI_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
# MPI MXNet MNIST horovodrun: inside the matrix.image docker-compose service,
# launches mxnet_mnist.py with OMP_NUM_THREADS=1 through the launcher command
# read from /mpirun_command, capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI MXNet MNIST horovodrun [attempt 1 of 3]'
  id: MPI_MXNet_MNIST_horovodrun_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet_MNIST_horovodrun
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
- name: 'MPI MXNet MNIST horovodrun [attempt 2 of 3]'
  id: MPI_MXNet_MNIST_horovodrun_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet_MNIST_horovodrun &&
    steps.MPI_MXNet_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
- name: 'MPI MXNet MNIST horovodrun [attempt 3 of 3]'
  id: MPI_MXNet_MNIST_horovodrun_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet_MNIST_horovodrun &&
    steps.MPI_MXNet_MNIST_horovodrun_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
# MPI MXNet MNIST horovodrun [ONECCL MPI]: inside the matrix.image
# docker-compose service, executes the command read from /oneccl_env, writes
# the text '/mpirun_command_mpi' into /mpirun_command, then launches
# mxnet_mnist.py with OMP_NUM_THREADS=1 through the command read back from
# /mpirun_command. The command is capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]'
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
- name: 'MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]'
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI &&
    steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
- name: 'MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]'
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI &&
    steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
# MPI MXNet MNIST horovodrun [ONECCL OFI]: inside the matrix.image
# docker-compose service, executes the command read from /oneccl_env, writes
# the text '/mpirun_command_ofi' into /mpirun_command, then launches
# mxnet_mnist.py with OMP_NUM_THREADS=1 through the command read back from
# /mpirun_command. The command is capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]'
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
- name: 'MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]'
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI &&
    steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
- name: 'MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]'
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI &&
    steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
# MPI Parallel PyTests: inside the matrix.image docker-compose service, lists
# test_*.py in /horovod/test/parallel and, one file per invocation (xargs -n 1),
# runs `/bin/bash /pytest.sh mpi <file>` through the launcher command read from
# /mpirun_command. The whole run is capped by timeout 15m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI Parallel PyTests [attempt 1 of 3]'
  id: MPI_Parallel_PyTests_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Parallel_PyTests
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
- name: 'MPI Parallel PyTests [attempt 2 of 3]'
  id: MPI_Parallel_PyTests_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Parallel_PyTests &&
    steps.MPI_Parallel_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
- name: 'MPI Parallel PyTests [attempt 3 of 3]'
  id: MPI_Parallel_PyTests_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Parallel_PyTests &&
    steps.MPI_Parallel_PyTests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
# MPI Parallel PyTests [ONECCL MPI]: inside the matrix.image docker-compose
# service, executes the command read from /oneccl_env, writes the text
# '/mpirun_command_mpi' into /mpirun_command, then lists test_*.py in
# /horovod/test/parallel and, one file per invocation (xargs -n 1), runs
# `/bin/bash /pytest.sh mpi <file>` through the command read back from
# /mpirun_command. The whole run is capped by timeout 15m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI Parallel PyTests [ONECCL MPI] [attempt 1 of 3]'
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Parallel_PyTests_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
- name: 'MPI Parallel PyTests [ONECCL MPI] [attempt 2 of 3]'
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Parallel_PyTests_ONECCL_MPI &&
    steps.MPI_Parallel_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
- name: 'MPI Parallel PyTests [ONECCL MPI] [attempt 3 of 3]'
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Parallel_PyTests_ONECCL_MPI &&
    steps.MPI_Parallel_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
# MPI Parallel PyTests [ONECCL OFI]: inside the matrix.image docker-compose
# service, executes the command read from /oneccl_env, writes the text
# '/mpirun_command_ofi' into /mpirun_command, then lists test_*.py in
# /horovod/test/parallel and, one file per invocation (xargs -n 1), runs
# `/bin/bash /pytest.sh mpi <file>` through the command read back from
# /mpirun_command. The whole run is capped by timeout 15m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI Parallel PyTests [ONECCL OFI] [attempt 1 of 3]'
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Parallel_PyTests_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
- name: 'MPI Parallel PyTests [ONECCL OFI] [attempt 2 of 3]'
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Parallel_PyTests_ONECCL_OFI &&
    steps.MPI_Parallel_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
- name: 'MPI Parallel PyTests [ONECCL OFI] [attempt 3 of 3]'
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_Parallel_PyTests_ONECCL_OFI &&
    steps.MPI_Parallel_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
# MPI PyTorch MNIST api: runs pytorch_mnist.py directly with
# `--data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2
# --communication mpi` in the matrix.image docker-compose service, capped by
# timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI PyTorch MNIST api [attempt 1 of 3]'
  id: MPI_PyTorch_MNIST_api_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_PyTorch_MNIST_api
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
- name: 'MPI PyTorch MNIST api [attempt 2 of 3]'
  id: MPI_PyTorch_MNIST_api_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_PyTorch_MNIST_api &&
    steps.MPI_PyTorch_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
- name: 'MPI PyTorch MNIST api [attempt 3 of 3]'
  id: MPI_PyTorch_MNIST_api_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_PyTorch_MNIST_api &&
    steps.MPI_PyTorch_MNIST_api_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
# MPI PyTorch MNIST api [ONECCL MPI]: inside the matrix.image docker-compose
# service, executes the command read from /oneccl_env, writes the text
# '/mpirun_command_mpi' into /mpirun_command, then runs pytorch_mnist.py with
# `--data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2
# --communication mpi`. The command is capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI PyTorch MNIST api [ONECCL MPI] [attempt 1 of 3]'
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
- name: 'MPI PyTorch MNIST api [ONECCL MPI] [attempt 2 of 3]'
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI &&
    steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
- name: 'MPI PyTorch MNIST api [ONECCL MPI] [attempt 3 of 3]'
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI &&
    steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
# MPI PyTorch MNIST api [ONECCL OFI]: inside the matrix.image docker-compose
# service, executes the command read from /oneccl_env, writes the text
# '/mpirun_command_ofi' into /mpirun_command, then runs pytorch_mnist.py with
# `--data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2
# --communication mpi`. The command is capped by timeout 10m.
# Retry chain: every attempt requires a successful build step and the matrix
# flag; attempt 2 runs only if attempt 1's outcome is 'failure', attempt 3 only
# if attempt 2's outcome is 'failure'. Attempts 1 and 2 tolerate failure
# (continue-on-error: true); attempt 3 does not.
# Each attempt creates its own host artifacts directory, mounted at /artifacts.
- name: 'MPI PyTorch MNIST api [ONECCL OFI] [attempt 1 of 3]'
  id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
- name: 'MPI PyTorch MNIST api [ONECCL OFI] [attempt 2 of 3]'
  id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI &&
    steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
- name: 'MPI PyTorch MNIST api [ONECCL OFI] [attempt 3 of 3]'
  id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI &&
    steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
# MPI PyTorch MNIST horovodrun: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. The example script is launched with the contents of
# /mpirun_command as its command prefix.
- name: "MPI PyTorch MNIST horovodrun [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_horovodrun_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
- name: "MPI PyTorch MNIST horovodrun [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_horovodrun_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
- name: "MPI PyTorch MNIST horovodrun [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_horovodrun_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
# MPI PyTorch MNIST horovodrun [ONECCL MPI]: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Inside the container the contents of /oneccl_env are run
# as a command, '/mpirun_command_mpi' is written to /mpirun_command, and that
# file's contents then prefix the example script invocation.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
# MPI PyTorch MNIST horovodrun [ONECCL OFI]: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Inside the container the contents of /oneccl_env are run
# as a command, '/mpirun_command_ofi' is written to /mpirun_command, and that
# file's contents then prefix the example script invocation.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets"
# MPI Single PyTests: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Each test_*.py file in /horovod/test/single is passed, one
# per invocation, to /pytest_standalone.sh with the argument 'mpi', under a
# 15 minute timeout.
- name: "MPI Single PyTests [attempt 1 of 3]"
  id: MPI_Single_PyTests_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
- name: "MPI Single PyTests [attempt 2 of 3]"
  id: MPI_Single_PyTests_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
- name: "MPI Single PyTests [attempt 3 of 3]"
  id: MPI_Single_PyTests_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# MPI Single PyTests [ONECCL MPI]: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Inside the container the contents of /oneccl_env are run
# as a command and '/mpirun_command_mpi' is written to /mpirun_command before
# each test_*.py file in /horovod/test/single is passed to
# /pytest_standalone.sh with the argument 'mpi'.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_Single_PyTests_ONECCL_MPI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
- name: "MPI Single PyTests [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_Single_PyTests_ONECCL_MPI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
- name: "MPI Single PyTests [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_Single_PyTests_ONECCL_MPI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# MPI Single PyTests [ONECCL OFI]: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Inside the container the contents of /oneccl_env are run
# as a command and '/mpirun_command_ofi' is written to /mpirun_command before
# each test_*.py file in /horovod/test/single is passed to
# /pytest_standalone.sh with the argument 'mpi'.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_Single_PyTests_ONECCL_OFI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
- name: "MPI Single PyTests [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_Single_PyTests_ONECCL_OFI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
- name: "MPI Single PyTests [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_Single_PyTests_ONECCL_OFI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)"
# MPI TensorFlow 2.0 Keras MNIST api: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. The example script is invoked directly with the arguments
# '2 localhost:2 mpi' under a 10 minute timeout.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI]: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Inside the container the contents of /oneccl_env are run
# as a command and '/mpirun_command_mpi' is written to /mpirun_command before
# the example script starts.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI]: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Inside the container the contents of /oneccl_env are run
# as a command and '/mpirun_command_ofi' is written to /mpirun_command before
# the example script starts.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# MPI TensorFlow 2.0 Keras MNIST horovodrun: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. The example script is launched with the contents of
# /mpirun_command as its command prefix.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI]: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Inside the container the contents of /oneccl_env are run
# as a command, '/mpirun_command_mpi' is written to /mpirun_command, and that
# file's contents then prefix the example script invocation.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI]: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Inside the container the contents of /oneccl_env are run
# as a command, '/mpirun_command_ofi' is written to /mpirun_command, and that
# file's contents then prefix the example script invocation.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py"
# MPI TensorFlow 2.0 MNIST api: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. The example script is invoked directly with the arguments
# '2 localhost:2 mpi' under a 10 minute timeout.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 1 of 3]"
  id: MPI_TensorFlow_2_0_MNIST_api_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 MNIST api [attempt 2 of 3]"
  id: MPI_TensorFlow_2_0_MNIST_api_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 MNIST api [attempt 3 of 3]"
  id: MPI_TensorFlow_2_0_MNIST_api_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# MPI TensorFlow 2.0 MNIST api [ONECCL MPI]: up to three attempts.
# Attempts 1 and 2 tolerate failure (continue-on-error); every retry is gated
# on the previous attempt's outcome being 'failure'; a failing third attempt
# is not tolerated. Inside the container the contents of /oneccl_env are run
# as a command and '/mpirun_command_mpi' is written to /mpirun_command before
# the example script starts.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# "MPI TensorFlow 2.0 MNIST api [ONECCL OFI]": up to three attempts of one command.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# Inside the container the contents of /oneccl_env are executed and
# '/mpirun_command_ofi' is written to /mpirun_command before the example starts.
- name: 'MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 1 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
- name: 'MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 2 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
- name: 'MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 3 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi"
# "MPI TensorFlow 2.0 MNIST horovodrun": up to three attempts of one command.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# The container expands the contents of /mpirun_command and uses them as the
# prefix of the python invocation.
- name: 'MPI TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
- name: 'MPI TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
- name: 'MPI TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
# "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI]": up to three attempts.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# The container executes the contents of /oneccl_env, writes
# '/mpirun_command_mpi' to /mpirun_command, then expands that file as the
# prefix of the python invocation.
- name: 'MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
- name: 'MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
- name: 'MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
# "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI]": up to three attempts.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# The container executes the contents of /oneccl_env, writes
# '/mpirun_command_ofi' to /mpirun_command, then expands that file as the
# prefix of the python invocation.
- name: 'MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
- name: 'MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
- name: 'MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]'
  id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py"
# "Run PyTests test_interactiverun": up to three attempts of the same pytest run.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# pytest writes its JUnit report to /artifacts/junit.mpi.integration.xml, and
# /artifacts is a per-attempt host directory, so each attempt keeps its own report.
- name: 'Run PyTests test_interactiverun [attempt 1 of 3]'
  id: Run_PyTests_test_interactiverun_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py"
- name: 'Run PyTests test_interactiverun [attempt 2 of 3]'
  id: Run_PyTests_test_interactiverun_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py"
- name: 'Run PyTests test_interactiverun [attempt 3 of 3]'
  id: Run_PyTests_test_interactiverun_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py"
# "Single Keras MNIST": up to three attempts of keras_mnist_advanced.py.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# Each attempt mounts its own artifacts directory at /artifacts and is limited
# to 10 minutes by /usr/bin/timeout.
- name: 'Single Keras MNIST [attempt 1 of 3]'
  id: Single_Keras_MNIST_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64"
- name: 'Single Keras MNIST [attempt 2 of 3]'
  id: Single_Keras_MNIST_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64"
- name: 'Single Keras MNIST [attempt 3 of 3]'
  id: Single_Keras_MNIST_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64"
# "Single MXNet2 MNIST": up to three attempts of mxnet2_mnist.py.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# Each attempt mounts its own artifacts directory at /artifacts and is limited
# to 10 minutes by /usr/bin/timeout.
- name: 'Single MXNet2 MNIST [attempt 1 of 3]'
  id: Single_MXNet2_MNIST_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3"
- name: 'Single MXNet2 MNIST [attempt 2 of 3]'
  id: Single_MXNet2_MNIST_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3"
- name: 'Single MXNet2 MNIST [attempt 3 of 3]'
  id: Single_MXNet2_MNIST_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3"
# "Single MXNet MNIST": up to three attempts of mxnet_mnist.py.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# Each attempt mounts its own artifacts directory at /artifacts and is limited
# to 10 minutes by /usr/bin/timeout.
- name: 'Single MXNet MNIST [attempt 1 of 3]'
  id: Single_MXNet_MNIST_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
- name: 'Single MXNet MNIST [attempt 2 of 3]'
  id: Single_MXNet_MNIST_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
- name: 'Single MXNet MNIST [attempt 3 of 3]'
  id: Single_MXNet_MNIST_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
# "Single MXNet MNIST [ONECCL MPI]": up to three attempts of mxnet_mnist.py.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# Inside the container the contents of /oneccl_env are executed and
# '/mpirun_command_mpi' is written to /mpirun_command before the example starts.
- name: 'Single MXNet MNIST [ONECCL MPI] [attempt 1 of 3]'
  id: Single_MXNet_MNIST_ONECCL_MPI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
- name: 'Single MXNet MNIST [ONECCL MPI] [attempt 2 of 3]'
  id: Single_MXNet_MNIST_ONECCL_MPI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
- name: 'Single MXNet MNIST [ONECCL MPI] [attempt 3 of 3]'
  id: Single_MXNet_MNIST_ONECCL_MPI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
# "Single MXNet MNIST [ONECCL OFI]": up to three attempts of mxnet_mnist.py.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# Inside the container the contents of /oneccl_env are executed and
# '/mpirun_command_ofi' is written to /mpirun_command before the example starts.
- name: 'Single MXNet MNIST [ONECCL OFI] [attempt 1 of 3]'
  id: Single_MXNet_MNIST_ONECCL_OFI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
- name: 'Single MXNet MNIST [ONECCL OFI] [attempt 2 of 3]'
  id: Single_MXNet_MNIST_ONECCL_OFI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
- name: 'Single MXNet MNIST [ONECCL OFI] [attempt 3 of 3]'
  id: Single_MXNet_MNIST_ONECCL_OFI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3"
# "Single PyTorch MNIST": up to three attempts of pytorch_mnist.py.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# Each attempt mounts its own artifacts directory at /artifacts and is limited
# to 10 minutes by /usr/bin/timeout.
- name: 'Single PyTorch MNIST [attempt 1 of 3]'
  id: Single_PyTorch_MNIST_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
- name: 'Single PyTorch MNIST [attempt 2 of 3]'
  id: Single_PyTorch_MNIST_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
- name: 'Single PyTorch MNIST [attempt 3 of 3]'
  id: Single_PyTorch_MNIST_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
# "Single PyTorch MNIST [ONECCL MPI]": up to three attempts of pytorch_mnist.py.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# Inside the container the contents of /oneccl_env are executed and
# '/mpirun_command_mpi' is written to /mpirun_command before the example starts.
- name: 'Single PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]'
  id: Single_PyTorch_MNIST_ONECCL_MPI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
- name: 'Single PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]'
  id: Single_PyTorch_MNIST_ONECCL_MPI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
- name: 'Single PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]'
  id: Single_PyTorch_MNIST_ONECCL_MPI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
# "Single PyTorch MNIST [ONECCL OFI]": up to three attempts of pytorch_mnist.py.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# Inside the container the contents of /oneccl_env are executed and
# '/mpirun_command_ofi' is written to /mpirun_command before the example starts.
- name: 'Single PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]'
  id: Single_PyTorch_MNIST_ONECCL_OFI_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
- name: 'Single PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]'
  id: Single_PyTorch_MNIST_ONECCL_OFI_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
- name: 'Single PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]'
  id: Single_PyTorch_MNIST_ONECCL_OFI_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets"
# "Spark Keras MNIST": up to three attempts of keras_spark_mnist.py.
# Attempt 2 runs only if attempt 1 failed, attempt 3 only if attempt 2 failed;
# attempts 1 and 2 tolerate failure (continue-on-error), attempt 3 does not.
# The example is launched through /spark_env.sh with OMP_NUM_THREADS=1 and is
# limited to 10 minutes by /usr/bin/timeout.
- name: 'Spark Keras MNIST [attempt 1 of 3]'
  id: Spark_Keras_MNIST_run_1
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: 'Spark Keras MNIST [attempt 2 of 3]'
  id: Spark_Keras_MNIST_run_2
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: 'Spark Keras MNIST [attempt 3 of 3]'
  id: Spark_Keras_MNIST_run_3
  if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_2.outcome == 'failure'
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm \
      --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3:/artifacts" \
      ${{ matrix.image }} \
      /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# Spark Keras Rossmann estimator example, tried up to three times.
# Gated on a successful step `build` and the matrix flag
# Spark_Keras_Rossmann_Estimator; attempts 2 and 3 run only after the
# preceding attempt failed, and only attempt 3 can fail the job.
- name: 'Spark Keras Rossmann Estimator [attempt 1 of 3]'
  id: Spark_Keras_Rossmann_Estimator_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && true }}
  continue-on-error: true
  shell: bash
  # artifacts directory is mounted at /artifacts; the run is limited to 10 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
# retry after a failed attempt 1
- name: 'Spark Keras Rossmann Estimator [attempt 2 of 3]'
  id: Spark_Keras_Rossmann_Estimator_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Spark Keras Rossmann Estimator [attempt 3 of 3]'
  id: Spark_Keras_Rossmann_Estimator_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Estimator && steps.Spark_Keras_Rossmann_Estimator_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
# Spark Keras Rossmann "run" example, tried up to three times.
# Requires a successful step `build` and the matrix flag
# Spark_Keras_Rossmann_Run. Each retry is conditional on the previous attempt
# having failed; attempts 1 and 2 never fail the job themselves.
- name: 'Spark Keras Rossmann Run [attempt 1 of 3]'
  id: Spark_Keras_Rossmann_Run_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && true }}
  continue-on-error: true
  shell: bash
  # artifacts directory is mounted at /artifacts; the run is limited to 10 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
# retry after a failed attempt 1
- name: 'Spark Keras Rossmann Run [attempt 2 of 3]'
  id: Spark_Keras_Rossmann_Run_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Spark Keras Rossmann Run [attempt 3 of 3]'
  id: Spark_Keras_Rossmann_Run_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Keras_Rossmann_Run && steps.Spark_Keras_Rossmann_Run_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
# Spark PyTorch Lightning MNIST example, tried up to three times.
# Runs only after a successful step `build` and when the matrix flag
# Spark_Lightning_MNIST is set; retries fire only after the previous attempt
# failed, and only the third attempt's failure fails the job.
- name: 'Spark Lightning MNIST [attempt 1 of 3]'
  id: Spark_Lightning_MNIST_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && true }}
  continue-on-error: true
  shell: bash
  # artifacts directory is mounted at /artifacts; the run is limited to 10 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# retry after a failed attempt 1
- name: 'Spark Lightning MNIST [attempt 2 of 3]'
  id: Spark_Lightning_MNIST_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Spark Lightning MNIST [attempt 3 of 3]'
  id: Spark_Lightning_MNIST_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Lightning_MNIST && steps.Spark_Lightning_MNIST_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# Spark integration pytests, tried up to three times.
# Inside the container every test_spark*.py file in /horovod/test/integration
# is passed one at a time (xargs -n 1) to /pytest_standalone.sh.
# Gated on a successful step `build` and the matrix flag Spark_PyTests; retries
# fire only after the previous attempt failed, and only attempt 3 can fail the job.
- name: 'Spark PyTests [attempt 1 of 3]'
  id: Spark_PyTests_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && true }}
  continue-on-error: true
  shell: bash
  # artifacts directory is mounted at /artifacts; the run is limited to 30 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
# retry after a failed attempt 1
- name: 'Spark PyTests [attempt 2 of 3]'
  id: Spark_PyTests_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Spark PyTests [attempt 3 of 3]'
  id: Spark_PyTests_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_PyTests && steps.Spark_PyTests_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
# Spark TensorFlow 2.0 MNIST data-service example, tried up to three times.
# The container command starts compute_worker.py via spark-submit in the
# background (`&`) and then runs the training job in the foreground; both are
# given /tmp/compute.json.
# Gated on a successful step `build` and the matrix flag
# Spark_TensorFlow_2_0_MNIST_Data_Service; retries fire only after the previous
# attempt failed, and only attempt 3 can fail the job.
- name: 'Spark TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]'
  id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && true }}
  continue-on-error: true
  shell: bash
  # artifacts directory is mounted at /artifacts; the run is limited to 10 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
# retry after a failed attempt 1
- name: 'Spark TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]'
  id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Spark TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]'
  id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_TensorFlow_2_0_MNIST_Data_Service && steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
# Spark PyTorch MNIST example, tried up to three times.
# Gated on a successful step `build` and the matrix flag Spark_Torch_MNIST;
# retries fire only after the previous attempt failed, and only attempt 3
# can fail the job.
- name: 'Spark Torch MNIST [attempt 1 of 3]'
  id: Spark_Torch_MNIST_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && true }}
  continue-on-error: true
  shell: bash
  # artifacts directory is mounted at /artifacts; the run is limited to 10 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# retry after a failed attempt 1
- name: 'Spark Torch MNIST [attempt 2 of 3]'
  id: Spark_Torch_MNIST_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Spark Torch MNIST [attempt 3 of 3]'
  id: Spark_Torch_MNIST_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Spark_Torch_MNIST && steps.Spark_Torch_MNIST_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# Upload every XML report found under artifacts/<image>/ as one artifact.
# always() keeps this running after earlier failures; the upload is limited
# to images whose name contains '-cpu-'.
- name: Upload Test Results
  if: ${{ always() && contains(matrix.image, '-cpu-') }}
  uses: actions/upload-artifact@v4
  with:
    name: 'Unit Test Results - ${{ matrix.image }}'
    path: 'artifacts/${{ matrix.image }}/**/*.xml'
# Job header for the "heads" images (image names carry tfhead/torchhead/mxnethead).
build-and-test-heads:
  name: 'Build and Test heads (${{ matrix.image }})'
  # starts after init-workflow and build-and-test, and only when init-workflow
  # reports both run-at-all and run-builds-and-tests as 'true'
  needs:
    - init-workflow
    - build-and-test
  if: >-
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true'
  runs-on: ubuntu-latest
  strategy:
    # at most two images run at once, and one failing image does not cancel the other
    fail-fast: false
    max-parallel: 2
    matrix:
      include:
        # each `<Test_Name>: true` flag switches on the steps guarded by `matrix.<Test_Name>`;
        # build_timeout is passed to the Build step as `<build_timeout>m`
        - image: test-cpu-openmpi-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_3_1
          build_timeout: 30
          Elastic_Tests_1: true
          Gloo_Cluster_PyTests: true
          Gloo_MXNet2_MNIST_api: true
          Gloo_MXNet2_MNIST_horovodrun: true
          Gloo_Parallel_PyTests: true
          Gloo_PyTorch_MNIST_api: true
          Gloo_PyTorch_MNIST_horovodrun: true
          Gloo_Single_PyTests: true
          Gloo_TensorFlow_2_0_Keras_MNIST_api: true
          Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_api: true
          Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun: true
          Gloo_TensorFlow_2_0_MNIST_api: true
          Gloo_TensorFlow_2_0_MNIST_horovodrun: true
          MPI_Cluster_PyTests: true
          MPI_MXNet2_MNIST_api: true
          MPI_MXNet2_MNIST_horovodrun: true
          MPI_Parallel_PyTests: true
          MPI_PyTorch_MNIST_api: true
          MPI_PyTorch_MNIST_horovodrun: true
          MPI_Single_PyTests: true
          MPI_TensorFlow_2_0_Keras_MNIST_api: true
          MPI_TensorFlow_2_0_Keras_MNIST_horovodrun: true
          MPI_TensorFlow_2_0_MNIST_api: true
          MPI_TensorFlow_2_0_MNIST_horovodrun: true
          Run_PyTests_test_interactiverun: true
          Single_MXNet2_MNIST: true
          Single_PyTorch_MNIST: true
          Spark_Lightning_MNIST: true
          Spark_PyTests: true
          Spark_TensorFlow_2_0_MNIST_Data_Service: true
          Spark_Torch_MNIST: true
        # the GPU image sets no test flags, so its flag-guarded test steps are skipped
        - image: test-gpu-openmpi-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_3_1
          build_timeout: 40
steps: | |
# Free runner disk space before building, logging `df -h` before and after.
# Removing the full trees would free 38 GB:
#   sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
# but that sometimes takes 3-4 minutes, so only sub-paths known to be
# quick to delete (10s) are removed, which frees 20 GB.
- name: Clean up disk space
  run: |
    echo ::group::Disk space before clean up
    df -h
    echo ::endgroup::
    # the glob is escaped so the pattern itself is logged;
    # the unquoted $dir below lets the shell expand it for du and rm
    for dir in \
      /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \
      /usr/share/dotnet/shared \
      /usr/local/lib/android/sdk/ndk \
      /usr/local/lib/android/sdk/build-tools \
      /opt/ghc
    do
      echo ::group::Deleting "$dir"
      # size report is best effort: a missing path must not abort the step
      sudo du -hsc $dir | tail -n1 || true
      sudo rm -rf $dir
      echo ::endgroup::
    done
    echo ::group::Disk space after clean up
    df -h
    echo ::endgroup::
# Check out the repository, including nested git submodules.
- name: Checkout
  uses: actions/checkout@v4
  with:
    submodules: 'recursive'
# Install Python 3.8 on the runner.
- name: Setup Python
  uses: actions/setup-python@v5
  with:
    # quoted so YAML keeps the version a string; an unquoted value such as
    # 3.10 would be parsed as the float 3.1 and select the wrong interpreter
    python-version: '3.8'
# Install the docker-compose command with pip.
- name: Setup docker-compose
  run: |
    pip install docker-compose
# Build the docker-compose service named after the matrix image.
# The step id `build` is what the test steps check via steps.build.outcome.
- name: Build
  id: build
  # both variables are set to 1 for the docker-compose build below
  env:
    COMPOSE_DOCKER_CLI_BUILD: 1
    DOCKER_BUILDKIT: 1
  # the wrapper receives `<build_timeout>m 3 10` ahead of the command
  # NOTE(review): `3 10` are presumably retry count and wait time; confirm in timeout-and-retry.sh
  run: |
    .github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml build ${{ matrix.image }}
# Elastic Spark TensorFlow tests (test_elastic_spark_tensorflow2.py), tried up
# to three times. pytest writes its JUnit report to /artifacts, the mounted
# per-attempt artifacts directory.
# Gated on a successful step `build` and the matrix flag
# Elastic_Spark_TensorFlow_Tests_1; retries fire only after the previous
# attempt failed, and only attempt 3 can fail the job.
- name: 'Elastic Spark TensorFlow Tests 1 [attempt 1 of 3]'
  id: Elastic_Spark_TensorFlow_Tests_1_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && true }}
  continue-on-error: true
  shell: bash
  # the run is limited to 30 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
# retry after a failed attempt 1
- name: 'Elastic Spark TensorFlow Tests 1 [attempt 2 of 3]'
  id: Elastic_Spark_TensorFlow_Tests_1_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Elastic Spark TensorFlow Tests 1 [attempt 3 of 3]'
  id: Elastic_Spark_TensorFlow_Tests_1_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_1 && steps.Elastic_Spark_TensorFlow_Tests_1_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_1_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow2.py"
# Elastic Spark TensorFlow tests (test_elastic_spark_tensorflow.py), tried up
# to three times. pytest writes its JUnit report to /artifacts, the mounted
# per-attempt artifacts directory.
# Gated on a successful step `build` and the matrix flag
# Elastic_Spark_TensorFlow_Tests_2; retries fire only after the previous
# attempt failed, and only attempt 3 can fail the job.
- name: 'Elastic Spark TensorFlow Tests 2 [attempt 1 of 3]'
  id: Elastic_Spark_TensorFlow_Tests_2_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && true }}
  continue-on-error: true
  shell: bash
  # the run is limited to 30 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
# retry after a failed attempt 1
- name: 'Elastic Spark TensorFlow Tests 2 [attempt 2 of 3]'
  id: Elastic_Spark_TensorFlow_Tests_2_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Elastic Spark TensorFlow Tests 2 [attempt 3 of 3]'
  id: Elastic_Spark_TensorFlow_Tests_2_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_TensorFlow_Tests_2 && steps.Elastic_Spark_TensorFlow_Tests_2_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_TensorFlow_Tests_2_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.tf.xml test_elastic_spark_tensorflow.py"
# Elastic Spark Torch tests (test_elastic_spark_torch.py), tried up to three
# times. pytest writes its JUnit report to /artifacts, the mounted per-attempt
# artifacts directory.
# Gated on a successful step `build` and the matrix flag
# Elastic_Spark_Torch_Tests; retries fire only after the previous attempt
# failed, and only attempt 3 can fail the job.
- name: 'Elastic Spark Torch Tests [attempt 1 of 3]'
  id: Elastic_Spark_Torch_Tests_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && true }}
  continue-on-error: true
  shell: bash
  # the run is limited to 30 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
# retry after a failed attempt 1
- name: 'Elastic Spark Torch Tests [attempt 2 of 3]'
  id: Elastic_Spark_Torch_Tests_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Elastic Spark Torch Tests [attempt 3 of 3]'
  id: Elastic_Spark_Torch_Tests_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Spark_Torch_Tests && steps.Elastic_Spark_Torch_Tests_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Spark_Torch_Tests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && /spark_env.sh HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.spark.torch.xml test_elastic_spark_torch.py"
# Elastic tests (test_elastic_torch.py and test_elastic_tensorflow2.py), tried
# up to three times. pytest writes its JUnit report to /artifacts, the mounted
# per-attempt artifacts directory.
# Gated on a successful step `build` and the matrix flag Elastic_Tests_1;
# retries fire only after the previous attempt failed, and only attempt 3
# can fail the job.
- name: 'Elastic Tests 1 [attempt 1 of 3]'
  id: Elastic_Tests_1_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && true }}
  continue-on-error: true
  shell: bash
  # the run is limited to 15 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
# retry after a failed attempt 1
- name: 'Elastic Tests 1 [attempt 2 of 3]'
  id: Elastic_Tests_1_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Elastic Tests 1 [attempt 3 of 3]'
  id: Elastic_Tests_1_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_1 && steps.Elastic_Tests_1_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_1_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
# Elastic tests (test_elastic_torch.py, test_elastic_tensorflow.py and
# test_elastic_tensorflow_keras.py), tried up to three times. pytest writes its
# JUnit report to /artifacts, the mounted per-attempt artifacts directory.
# Gated on a successful step `build` and the matrix flag Elastic_Tests_2;
# retries fire only after the previous attempt failed, and only attempt 3
# can fail the job.
- name: 'Elastic Tests 2 [attempt 1 of 3]'
  id: Elastic_Tests_2_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && true }}
  continue-on-error: true
  shell: bash
  # the run is limited to 15 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
# retry after a failed attempt 1
- name: 'Elastic Tests 2 [attempt 2 of 3]'
  id: Elastic_Tests_2_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Elastic Tests 2 [attempt 3 of 3]'
  id: Elastic_Tests_2_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Elastic_Tests_2 && steps.Elastic_Tests_2_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Elastic_Tests_2_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow.py test_elastic_tensorflow_keras.py"
# Gloo cluster pytests (test_static_run.py), tried up to three times.
# The container command starts the ssh service before running pytest, which
# writes its JUnit report to /artifacts, the mounted per-attempt artifacts
# directory.
# Gated on a successful step `build` and the matrix flag Gloo_Cluster_PyTests;
# retries fire only after the previous attempt failed, and only attempt 3
# can fail the job.
- name: 'Gloo Cluster PyTests [attempt 1 of 3]'
  id: Gloo_Cluster_PyTests_run_1
  # the trailing `&& true` sits where later attempts test the previous outcome
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && true }}
  continue-on-error: true
  shell: bash
  # the run is limited to 10 minutes
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
# retry after a failed attempt 1
- name: 'Gloo Cluster PyTests [attempt 2 of 3]'
  id: Gloo_Cluster_PyTests_run_2
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_1.outcome == 'failure' }}
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
# last retry after a failed attempt 2; its failure fails the job
- name: 'Gloo Cluster PyTests [attempt 3 of 3]'
  id: Gloo_Cluster_PyTests_run_3
  if: ${{ always() && steps.build.outcome == 'success' && matrix.Gloo_Cluster_PyTests && steps.Gloo_Cluster_PyTests_run_2.outcome == 'failure' }}
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Cluster_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
# Gloo Keras MNIST: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_Keras_MNIST. Attempts 2 and 3 additionally run only when the
# previous attempt's outcome (its result before continue-on-error is applied)
# was 'failure'. Attempts 1 and 2 set continue-on-error, so only a failure of
# attempt 3 fails the job.
- name: "Gloo Keras MNIST [attempt 1 of 3]"
  id: Gloo_Keras_MNIST_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && true
  # Runs keras_mnist_advanced.py through horovodrun (2 processes on localhost,
  # Gloo) with a 10 minute timeout; the per-attempt artifacts directory is
  # mounted at /artifacts.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
  shell: bash
- name: "Gloo Keras MNIST [attempt 2 of 3]"
  id: Gloo_Keras_MNIST_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
  shell: bash
- name: "Gloo Keras MNIST [attempt 3 of 3]"
  id: Gloo_Keras_MNIST_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Keras_MNIST && steps.Gloo_Keras_MNIST_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/keras/keras_mnist_advanced.py
  shell: bash
# Gloo MXNet2 MNIST api: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_MXNet2_MNIST_api. Attempts 2 and 3 additionally run only when
# the previous attempt's outcome (its result before continue-on-error is
# applied) was 'failure'. Attempts 1 and 2 set continue-on-error, so only a
# failure of attempt 3 fails the job.
- name: "Gloo MXNet2 MNIST api [attempt 1 of 3]"
  id: Gloo_MXNet2_MNIST_api_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && true
  # Invokes mxnet2_mnist.py directly with python, passing process count, hosts
  # and the gloo communication option as script arguments; 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
- name: "Gloo MXNet2 MNIST api [attempt 2 of 3]"
  id: Gloo_MXNet2_MNIST_api_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && steps.Gloo_MXNet2_MNIST_api_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
- name: "Gloo MXNet2 MNIST api [attempt 3 of 3]"
  id: Gloo_MXNet2_MNIST_api_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_api && steps.Gloo_MXNet2_MNIST_api_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo MXNet2 MNIST horovodrun: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_MXNet2_MNIST_horovodrun. Attempts 2 and 3 additionally run
# only when the previous attempt's outcome (its result before
# continue-on-error is applied) was 'failure'. Attempts 1 and 2 set
# continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo MXNet2 MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_MXNet2_MNIST_horovodrun_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && true
  # Runs mxnet2_mnist.py through horovodrun (2 processes on localhost, Gloo)
  # with a 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
  shell: bash
- name: "Gloo MXNet2 MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_MXNet2_MNIST_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && steps.Gloo_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
  shell: bash
- name: "Gloo MXNet2 MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_MXNet2_MNIST_horovodrun_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet2_MNIST_horovodrun && steps.Gloo_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet2_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
  shell: bash
# Gloo MXNet MNIST horovodrun: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_MXNet_MNIST_horovodrun. Attempts 2 and 3 additionally run
# only when the previous attempt's outcome (its result before
# continue-on-error is applied) was 'failure'. Attempts 1 and 2 set
# continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo MXNet MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_MXNet_MNIST_horovodrun_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && true
  # Runs mxnet_mnist.py through horovodrun (2 processes on localhost, Gloo)
  # with a 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
  shell: bash
- name: "Gloo MXNet MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_MXNet_MNIST_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && steps.Gloo_MXNet_MNIST_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
  shell: bash
- name: "Gloo MXNet MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_MXNet_MNIST_horovodrun_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_MXNet_MNIST_horovodrun && steps.Gloo_MXNet_MNIST_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_MXNet_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet_mnist.py
  shell: bash
# Gloo Parallel PyTests: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_Parallel_PyTests. Attempts 2 and 3 additionally run only when
# the previous attempt's outcome (its result before continue-on-error is
# applied) was 'failure'. Attempts 1 and 2 set continue-on-error, so only a
# failure of attempt 3 fails the job.
- name: "Gloo Parallel PyTests [attempt 1 of 3]"
  id: Gloo_Parallel_PyTests_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && true
  # Lists test_*.py in /horovod/test/parallel and hands each file, one per
  # invocation (xargs -n 1), to /pytest.sh under horovodrun with 2 processes;
  # the whole run is capped at 15 minutes.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
  shell: bash
- name: "Gloo Parallel PyTests [attempt 2 of 3]"
  id: Gloo_Parallel_PyTests_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
  shell: bash
- name: "Gloo Parallel PyTests [attempt 3 of 3]"
  id: Gloo_Parallel_PyTests_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Parallel_PyTests && steps.Gloo_Parallel_PyTests_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Parallel_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 horovodrun -np 2 -H localhost:2 --gloo /bin/bash /pytest.sh gloo)"
  shell: bash
# Gloo PyTorch MNIST api: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_PyTorch_MNIST_api. Attempts 2 and 3 additionally run only
# when the previous attempt's outcome (its result before continue-on-error is
# applied) was 'failure'. Attempts 1 and 2 set continue-on-error, so only a
# failure of attempt 3 fails the job.
- name: "Gloo PyTorch MNIST api [attempt 1 of 3]"
  id: Gloo_PyTorch_MNIST_api_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && true
  # Invokes pytorch_mnist.py directly with python, passing the data directory,
  # process count, hosts and the gloo communication option as script
  # arguments; 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
- name: "Gloo PyTorch MNIST api [attempt 2 of 3]"
  id: Gloo_PyTorch_MNIST_api_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && steps.Gloo_PyTorch_MNIST_api_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
- name: "Gloo PyTorch MNIST api [attempt 3 of 3]"
  id: Gloo_PyTorch_MNIST_api_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_api && steps.Gloo_PyTorch_MNIST_api_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo
  shell: bash
# Gloo PyTorch MNIST horovodrun: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_PyTorch_MNIST_horovodrun. Attempts 2 and 3 additionally run
# only when the previous attempt's outcome (its result before
# continue-on-error is applied) was 'failure'. Attempts 1 and 2 set
# continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo PyTorch MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_PyTorch_MNIST_horovodrun_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && true
  # Runs pytorch_mnist.py through horovodrun (2 processes on localhost, Gloo)
  # with a 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
  shell: bash
- name: "Gloo PyTorch MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_PyTorch_MNIST_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && steps.Gloo_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
  shell: bash
- name: "Gloo PyTorch MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_PyTorch_MNIST_horovodrun_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_PyTorch_MNIST_horovodrun && steps.Gloo_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_PyTorch_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
  shell: bash
# Gloo Single PyTests: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_Single_PyTests. Attempts 2 and 3 additionally run only when
# the previous attempt's outcome (its result before continue-on-error is
# applied) was 'failure'. Attempts 1 and 2 set continue-on-error, so only a
# failure of attempt 3 fails the job.
- name: "Gloo Single PyTests [attempt 1 of 3]"
  id: Gloo_Single_PyTests_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && true
  # Lists test_*.py in /horovod/test/single and hands each file, one per
  # invocation (xargs -n 1), to /pytest_standalone.sh; the whole run is capped
  # at 15 minutes.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
  shell: bash
- name: "Gloo Single PyTests [attempt 2 of 3]"
  id: Gloo_Single_PyTests_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
  shell: bash
- name: "Gloo Single PyTests [attempt 3 of 3]"
  id: Gloo_Single_PyTests_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_Single_PyTests && steps.Gloo_Single_PyTests_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_Single_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST api: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_TensorFlow_2_0_Keras_MNIST_api. Attempts 2 and 3 additionally
# run only when the previous attempt's outcome (its result before
# continue-on-error is applied) was 'failure'. Attempts 1 and 2 set
# continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && true
  # Invokes tensorflow2_keras_mnist.py directly with python and the positional
  # arguments `2 localhost:2 gloo`; 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
  shell: bash
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
  shell: bash
- name: "Gloo TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_api && steps.Gloo_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 Keras MNIST horovodrun: up to three attempts of the same
# command. Every attempt requires the build step to have succeeded and the
# matrix entry to enable Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun. Attempts
# 2 and 3 additionally run only when the previous attempt's outcome (its
# result before continue-on-error is applied) was 'failure'. Attempts 1 and 2
# set continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && true
  # Runs tensorflow2_keras_mnist.py through horovodrun (2 processes on
  # localhost, Gloo) with a 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
  shell: bash
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
  shell: bash
- name: "Gloo TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST Data Service: up to three attempts of the same
# command. Every attempt requires the build step to have succeeded and the
# matrix entry to enable Gloo_TensorFlow_2_0_MNIST_Data_Service. Attempts 2
# and 3 additionally run only when the previous attempt's outcome (its result
# before continue-on-error is applied) was 'failure'. Attempts 1 and 2 set
# continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && true
  # Launches the compute_worker module under horovodrun in the background
  # (`&`), then runs tensorflow2_mnist_data_service.py under horovodrun in the
  # foreground; both are given /tmp/compute.json. The whole bash -c command is
  # capped at 10 minutes.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Data_Service && steps.Gloo_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "horovodrun -np 2 python -m horovod.tensorflow.data.compute_worker /tmp/compute.json & horovodrun -np 2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist_data_service.py /tmp/compute.json"
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic api: up to three attempts of the same
# command. Every attempt requires the build step to have succeeded and the
# matrix entry to enable Gloo_TensorFlow_2_0_MNIST_Elastic_api. Attempts 2 and
# 3 additionally run only when the previous attempt's outcome (its result
# before continue-on-error is applied) was 'failure'. Attempts 1 and 2 set
# continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && true
  # Invokes tensorflow2_mnist_elastic.py directly with python and the
  # positional arguments `2 2 2 localhost:2,127.0.0.1:2`; 10 minute timeout.
  # NOTE(review): the three numbers presumably correspond to np / min-np /
  # max-np as in the horovodrun variant of this test; confirm in the script.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST Elastic api [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_api && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py 2 2 2 localhost:2,127.0.0.1:2
  shell: bash
# Gloo TensorFlow 2.0 MNIST Elastic horovodrun: up to three attempts of the
# same command. Every attempt requires the build step to have succeeded and
# the matrix entry to enable Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun.
# Attempts 2 and 3 additionally run only when the previous attempt's outcome
# (its result before continue-on-error is applied) was 'failure'. Attempts 1
# and 2 set continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && true
  # Runs tensorflow2_mnist_elastic.py through horovodrun with -np, --min-np
  # and --max-np all set to 2 and two host entries (localhost and 127.0.0.1,
  # 2 slots each); 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST Elastic horovodrun [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_Elastic_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 --min-np 2 --max-np 2 -H localhost:2,127.0.0.1:2 --gloo python /horovod/examples/elastic/tensorflow2/tensorflow2_mnist_elastic.py
  shell: bash
# Gloo TensorFlow 2.0 MNIST api: up to three attempts of the same command.
# Every attempt requires the build step to have succeeded and the matrix entry
# to enable Gloo_TensorFlow_2_0_MNIST_api. Attempts 2 and 3 additionally run
# only when the previous attempt's outcome (its result before
# continue-on-error is applied) was 'failure'. Attempts 1 and 2 set
# continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_api_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && true
  # Invokes tensorflow2_mnist.py directly with python and the positional
  # arguments `2 localhost:2 gloo`; 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_api_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && steps.Gloo_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST api [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_api_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_api && steps.Gloo_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 gloo
  shell: bash
# Gloo TensorFlow 2.0 MNIST horovodrun: up to three attempts of the same
# command. Every attempt requires the build step to have succeeded and the
# matrix entry to enable Gloo_TensorFlow_2_0_MNIST_horovodrun. Attempts 2 and
# 3 additionally run only when the previous attempt's outcome (its result
# before continue-on-error is applied) was 'failure'. Attempts 1 and 2 set
# continue-on-error, so only a failure of attempt 3 fails the job.
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1
  continue-on-error: true
  # `&& true` occupies the slot where later attempts check the previous outcome.
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && true
  # Runs tensorflow2_mnist.py through horovodrun (2 processes on localhost,
  # Gloo) with a 10 minute timeout.
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
  shell: bash
- name: "Gloo TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]"
  id: Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3
  # Last attempt: a failure here is not tolerated.
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_2_0_MNIST_horovodrun && steps.Gloo_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
  shell: bash
# Gloo TensorFlow MNIST: retry chain of three attempts.
# Every attempt requires a successful build step and a truthy matrix.Gloo_TensorFlow_MNIST;
# always() replaces the default success() check.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "Gloo TensorFlow MNIST [attempt 1 of 3]"
  id: Gloo_TensorFlow_MNIST_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
  shell: bash
- name: "Gloo TensorFlow MNIST [attempt 2 of 3]"
  id: Gloo_TensorFlow_MNIST_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
  shell: bash
- name: "Gloo TensorFlow MNIST [attempt 3 of 3]"
  id: Gloo_TensorFlow_MNIST_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.Gloo_TensorFlow_MNIST && steps.Gloo_TensorFlow_MNIST_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Gloo_TensorFlow_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow/tensorflow_mnist.py
  shell: bash
# MPI Cluster PyTests: retry chain of three attempts.
# Each attempt starts sshd in the container, then runs pytest on test_static_run.py
# and writes the JUnit report to /artifacts/junit.mpi.static.xml (the per-attempt mount).
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI Cluster PyTests [attempt 1 of 3]"
  id: MPI_Cluster_PyTests_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [attempt 2 of 3]"
  id: MPI_Cluster_PyTests_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [attempt 3 of 3]"
  id: MPI_Cluster_PyTests_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests && steps.MPI_Cluster_PyTests_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# MPI Cluster PyTests [ONECCL MPI]: retry chain of three attempts.
# Inside the container each attempt first executes the contents of /oneccl_env and writes
# '/mpirun_command_mpi' into /mpirun_command; \$ is escaped so expansion happens in the container.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI Cluster PyTests [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_MPI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_MPI && steps.MPI_Cluster_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# MPI Cluster PyTests [ONECCL OFI]: retry chain of three attempts.
# Inside the container each attempt first executes the contents of /oneccl_env and writes
# '/mpirun_command_ofi' into /mpirun_command; \$ is escaped so expansion happens in the container.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
- name: "MPI Cluster PyTests [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_Cluster_PyTests_ONECCL_OFI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Cluster_PyTests_ONECCL_OFI && steps.MPI_Cluster_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Cluster_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.static.xml test_static_run.py"
  shell: bash
# MPI MXNet2 MNIST api: retry chain of three attempts.
# Each attempt runs mxnet2_mnist.py directly with --num-proc 2 --hosts localhost:2
# --communication mpi, bounded by /usr/bin/timeout 10m.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI MXNet2 MNIST api [attempt 1 of 3]"
  id: MPI_MXNet2_MNIST_api_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI MXNet2 MNIST api [attempt 2 of 3]"
  id: MPI_MXNet2_MNIST_api_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI MXNet2 MNIST api [attempt 3 of 3]"
  id: MPI_MXNet2_MNIST_api_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_api && steps.MPI_MXNet2_MNIST_api_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# MPI MXNet2 MNIST horovodrun: retry chain of three attempts.
# Each attempt launches mxnet2_mnist.py through the command stored in /mpirun_command
# with OMP_NUM_THREADS=1; \$ is escaped so the file is read inside the container.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI MXNet2 MNIST horovodrun [attempt 1 of 3]"
  id: MPI_MXNet2_MNIST_horovodrun_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
  shell: bash
- name: "MPI MXNet2 MNIST horovodrun [attempt 2 of 3]"
  id: MPI_MXNet2_MNIST_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
  shell: bash
- name: "MPI MXNet2 MNIST horovodrun [attempt 3 of 3]"
  id: MPI_MXNet2_MNIST_horovodrun_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet2_MNIST_horovodrun && steps.MPI_MXNet2_MNIST_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet2_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet2_mnist.py"
  shell: bash
# MPI MXNet MNIST horovodrun: retry chain of three attempts.
# Each attempt launches mxnet_mnist.py through the command stored in /mpirun_command
# with OMP_NUM_THREADS=1; \$ is escaped so the file is read inside the container.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI MXNet MNIST horovodrun [attempt 1 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [attempt 2 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [attempt 3 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun && steps.MPI_MXNet_MNIST_horovodrun_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# MPI MXNet MNIST horovodrun [ONECCL MPI]: retry chain of three attempts.
# Inside the container each attempt executes the contents of /oneccl_env, writes
# '/mpirun_command_mpi' into /mpirun_command, then launches mxnet_mnist.py through it.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# MPI MXNet MNIST horovodrun [ONECCL OFI]: retry chain of three attempts.
# Inside the container each attempt executes the contents of /oneccl_env, writes
# '/mpirun_command_ofi' into /mpirun_command, then launches mxnet_mnist.py through it.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
- name: "MPI MXNet MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI && steps.MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_MXNet_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && OMP_NUM_THREADS=1 \$(cat /mpirun_command) python /horovod/examples/mxnet/mxnet_mnist.py"
  shell: bash
# MPI Parallel PyTests: retry chain of three attempts (15m timeout each).
# Each attempt lists test_*.py in /horovod/test/parallel and, one file per xargs invocation,
# runs /pytest.sh mpi through the command stored in /mpirun_command.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI Parallel PyTests [attempt 1 of 3]"
  id: MPI_Parallel_PyTests_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [attempt 2 of 3]"
  id: MPI_Parallel_PyTests_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [attempt 3 of 3]"
  id: MPI_Parallel_PyTests_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests && steps.MPI_Parallel_PyTests_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# MPI Parallel PyTests [ONECCL MPI]: retry chain of three attempts (15m timeout each).
# Inside the container each attempt executes the contents of /oneccl_env, writes
# '/mpirun_command_mpi' into /mpirun_command, then runs /pytest.sh mpi per test_*.py file.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_MPI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_MPI && steps.MPI_Parallel_PyTests_ONECCL_MPI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# MPI Parallel PyTests [ONECCL OFI]: retry chain of three attempts (15m timeout each).
# Inside the container each attempt executes the contents of /oneccl_env, writes
# '/mpirun_command_ofi' into /mpirun_command, then runs /pytest.sh mpi per test_*.py file.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 1 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 2 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
- name: "MPI Parallel PyTests [ONECCL OFI] [attempt 3 of 3]"
  id: MPI_Parallel_PyTests_ONECCL_OFI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_Parallel_PyTests_ONECCL_OFI && steps.MPI_Parallel_PyTests_ONECCL_OFI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Parallel_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/parallel && (ls -1 test_*.py | xargs -n 1 \$(cat /mpirun_command) /bin/bash /pytest.sh mpi)"
  shell: bash
# MPI PyTorch MNIST api: retry chain of three attempts.
# Each attempt runs pytorch_mnist.py directly with --data-dir /data/pytorch_datasets
# --num-proc 2 --hosts localhost:2 --communication mpi, bounded by /usr/bin/timeout 10m.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI PyTorch MNIST api [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_api_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_api_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_api_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api && steps.MPI_PyTorch_MNIST_api_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# MPI PyTorch MNIST api [ONECCL MPI]: retry chain of three attempts.
# Inside the container each attempt executes the contents of /oneccl_env, writes
# '/mpirun_command_mpi' into /mpirun_command, then runs pytorch_mnist.py with --communication mpi.
# Attempts 2 and 3 run only when the preceding attempt's outcome is 'failure'.
# Only attempt 3 has continue-on-error: false, so only its failure fails the job.
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 1 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && true
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 2 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
  continue-on-error: true
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
- name: "MPI PyTorch MNIST api [ONECCL MPI] [attempt 3 of 3]"
  id: MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
  continue-on-error: false
  if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_MPI && steps.MPI_PyTorch_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure'
  run: |
    mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
  shell: bash
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# contents of /oneccl_env are run as a command (\$ defers expansion to the
# inner bash), /mpirun_command is overwritten with the path
# /mpirun_command_ofi, and the example script runs; all under a 10-minute
# timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI PyTorch MNIST api [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_api_ONECCL_OFI && steps.MPI_PyTorch_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI PyTorch MNIST horovodrun [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# command line stored in /mpirun_command (\$ defers expansion to the inner
# bash) launches the example script, under a 10-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI PyTorch MNIST horovodrun [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI PyTorch MNIST horovodrun [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun && steps.MPI_PyTorch_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# contents of /oneccl_env are run as a command, /mpirun_command is
# overwritten with the path /mpirun_command_mpi, and the example script is
# then launched through that path (\$ defers both expansions to the inner
# bash); all under a 10-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI PyTorch MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# contents of /oneccl_env are run as a command, /mpirun_command is
# overwritten with the path /mpirun_command_ofi, and the example script is
# then launched through that path (\$ defers both expansions to the inner
# bash); all under a 10-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI PyTorch MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI && steps.MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_PyTorch_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI Single PyTests [attempt 1 of 3]" | |
id: MPI_Single_PyTests_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container,
# /pytest_standalone.sh mpi is invoked once per test_*.py file in
# /horovod/test/single (xargs -n 1), under a 15-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI Single PyTests [attempt 2 of 3]" | |
id: MPI_Single_PyTests_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI Single PyTests [attempt 3 of 3]" | |
id: MPI_Single_PyTests_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests && steps.MPI_Single_PyTests_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c " cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_Single_PyTests_ONECCL_MPI_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# contents of /oneccl_env are run as a command (\$ defers expansion to the
# inner bash), /mpirun_command is overwritten with the path
# /mpirun_command_mpi, and /pytest_standalone.sh mpi is invoked once per
# test_*.py file in /horovod/test/single; all under a 15-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_Single_PyTests_ONECCL_MPI_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI Single PyTests [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_Single_PyTests_ONECCL_MPI_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_MPI && steps.MPI_Single_PyTests_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_Single_PyTests_ONECCL_OFI_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# contents of /oneccl_env are run as a command (\$ defers expansion to the
# inner bash), /mpirun_command is overwritten with the path
# /mpirun_command_ofi, and /pytest_standalone.sh mpi is invoked once per
# test_*.py file in /horovod/test/single; all under a 15-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_Single_PyTests_ONECCL_OFI_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI Single PyTests [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_Single_PyTests_ONECCL_OFI_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_Single_PyTests_ONECCL_OFI && steps.MPI_Single_PyTests_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_Single_PyTests_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 15m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh mpi)" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && true | |
# Creates this attempt's artifacts directory on the runner, mounts it at
# /artifacts in the matrix image's compose service, and runs the example
# script there under a 10-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST api [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# contents of /oneccl_env are run as a command (\$ defers expansion to the
# inner bash), /mpirun_command is overwritten with the path
# /mpirun_command_mpi, and the example script runs; all under a 10-minute
# timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# contents of /oneccl_env are run as a command (\$ defers expansion to the
# inner bash), /mpirun_command is overwritten with the path
# /mpirun_command_ofi, and the example script runs; all under a 10-minute
# timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST api [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# command line stored in /mpirun_command (\$ defers expansion to the inner
# bash) launches the example script, under a 10-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# contents of /oneccl_env are run as a command, /mpirun_command is
# overwritten with the path /mpirun_command_mpi, and the example script is
# then launched through that path (\$ defers both expansions to the inner
# bash); all under a 10-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && true | |
# Creates this attempt's artifacts directory on the runner and mounts it at
# /artifacts in the matrix image's compose service. Inside the container the
# contents of /oneccl_env are run as a command, /mpirun_command is
# overwritten with the path /mpirun_command_ofi, and the example script is
# then launched through that path (\$ defers both expansions to the inner
# bash); all under a 10-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI TensorFlow 2.0 Keras MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_Keras_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py" | |
shell: bash | |
# Attempt 1 of 3. continue-on-error keeps a failure here from failing the job;
# the attempt-2 step checks this step's outcome to decide whether to retry.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_run_1 | |
continue-on-error: true | |
# always() lets this run after earlier step failures, but only if the build
# step succeeded and the matrix enables this test; "&& true" is a no-op.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && true | |
# Creates this attempt's artifacts directory on the runner, mounts it at
# /artifacts in the matrix image's compose service, and runs the example
# script there under a 10-minute timeout.
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2 of 3: repeats the attempt-1 command with its own artifacts
# directory. Still non-fatal because continue-on-error is true.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_run_2 | |
continue-on-error: true | |
# Retries only when attempt 1 failed. outcome is the step result before
# continue-on-error is applied, so the failure is still visible here.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3 of 3 (last retry): continue-on-error is false, so a failure here
# fails the job.
- name: "MPI TensorFlow 2.0 MNIST api [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_run_3 | |
continue-on-error: false | |
# Runs only when attempt 2 failed; outcome is the step result before
# continue-on-error is applied.
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api && steps.MPI_TensorFlow_2_0_MNIST_api_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST api [ONECCL MPI]": one command, tried up to
# three times. Every attempt requires the build step's outcome to be 'success'
# and the matrix flag MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI to be truthy.
# Each attempt creates its own artifacts directory and mounts it at /artifacts.
# Inside the container (under "timeout 10m") the command first executes the
# command line read from /oneccl_env, then writes the text
# '/mpirun_command_mpi' into /mpirun_command, then starts tensorflow2_mnist.py
# with "2 localhost:2 mpi". The backslash in "\$(" keeps the runner's shell
# from expanding the substitution, so it is evaluated by the container's bash.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST api [ONECCL OFI]": one command, tried up to
# three times. Every attempt requires the build step's outcome to be 'success'
# and the matrix flag MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI to be truthy.
# Each attempt creates its own artifacts directory and mounts it at /artifacts.
# Inside the container (under "timeout 10m") the command first executes the
# command line read from /oneccl_env, then writes the text
# '/mpirun_command_ofi' into /mpirun_command, then starts tensorflow2_mnist.py
# with "2 localhost:2 mpi". The backslash in "\$(" keeps the runner's shell
# from expanding the substitution, so it is evaluated by the container's bash.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "MPI TensorFlow 2.0 MNIST api [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_api_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/tensorflow2/tensorflow2_mnist.py 2 localhost:2 mpi" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST horovodrun": one command, tried up to three
# times. Every attempt requires the build step's outcome to be 'success' and
# the matrix flag MPI_TensorFlow_2_0_MNIST_horovodrun to be truthy. Each
# attempt creates its own artifacts directory and mounts it at /artifacts.
# Inside the container (under "timeout 10m") the contents of /mpirun_command
# are expanded and used as the command prefix in front of
# "python .../tensorflow2_mnist.py". The backslash in "\$(" keeps the runner's
# shell from expanding the substitution before the container's bash sees it.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI]": one command, tried
# up to three times. Every attempt requires the build step's outcome to be
# 'success' and the matrix flag
# MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI to be truthy. Each attempt
# creates its own artifacts directory and mounts it at /artifacts.
# Inside the container (under "timeout 10m") the command executes the command
# line read from /oneccl_env, writes the text '/mpirun_command_mpi' into
# /mpirun_command, and then uses the contents of /mpirun_command as the
# command prefix in front of "python .../tensorflow2_mnist.py". The backslash
# in "\$(" defers each substitution to the container's bash.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL MPI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Test "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI]": one command, tried
# up to three times. Every attempt requires the build step's outcome to be
# 'success' and the matrix flag
# MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI to be truthy. Each attempt
# creates its own artifacts directory and mounts it at /artifacts.
# Inside the container (under "timeout 10m") the command executes the command
# line read from /oneccl_env, writes the text '/mpirun_command_ofi' into
# /mpirun_command, and then uses the contents of /mpirun_command as the
# command prefix in front of "python .../tensorflow2_mnist.py". The backslash
# in "\$(" defers each substitution to the container's bash.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 1 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 2 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "MPI TensorFlow 2.0 MNIST horovodrun [ONECCL OFI] [attempt 3 of 3]" | |
id: MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI && steps.MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/MPI_TensorFlow_2_0_MNIST_horovodrun_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && \$(cat /mpirun_command) python /horovod/examples/tensorflow2/tensorflow2_mnist.py" | |
shell: bash | |
# Test "Run PyTests test_interactiverun": one command, tried up to three times.
# Every attempt requires the build step's outcome to be 'success' and the
# matrix flag Run_PyTests_test_interactiverun to be truthy. Each attempt
# creates its own artifacts directory and mounts it at /artifacts.
# Inside the container (under "timeout 10m") the command changes to
# /horovod/test and runs pytest on integration/test_interactiverun.py, with
# the JUnit XML report written to /artifacts/junit.mpi.integration.xml, i.e.
# into the per-attempt directory mounted above.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Run PyTests test_interactiverun [attempt 1 of 3]" | |
id: Run_PyTests_test_interactiverun_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Run PyTests test_interactiverun [attempt 2 of 3]" | |
id: Run_PyTests_test_interactiverun_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Run PyTests test_interactiverun [attempt 3 of 3]" | |
id: Run_PyTests_test_interactiverun_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Run_PyTests_test_interactiverun && steps.Run_PyTests_test_interactiverun_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Run_PyTests_test_interactiverun_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/test && pytest -v --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.mpi.integration.xml integration/test_interactiverun.py" | |
shell: bash | |
# Test "Single Keras MNIST": one command, tried up to three times.
# Every attempt requires the build step's outcome to be 'success' and the
# matrix flag Single_Keras_MNIST to be truthy. Each attempt creates its own
# artifacts directory and mounts it at /artifacts in the container.
# The container command runs under "timeout 10m" and starts
# keras_mnist_advanced.py with "--epochs 3 --batch-size 64".
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Single Keras MNIST [attempt 1 of 3]" | |
id: Single_Keras_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Single Keras MNIST [attempt 2 of 3]" | |
id: Single_Keras_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Single Keras MNIST [attempt 3 of 3]" | |
id: Single_Keras_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_Keras_MNIST && steps.Single_Keras_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/keras/keras_mnist_advanced.py --epochs 3 --batch-size 64" | |
shell: bash | |
# Test "Single MXNet2 MNIST": one command, tried up to three times.
# Every attempt requires the build step's outcome to be 'success' and the
# matrix flag Single_MXNet2_MNIST to be truthy. Each attempt creates its own
# artifacts directory and mounts it at /artifacts in the container.
# The container command runs under "timeout 10m" and starts mxnet2_mnist.py
# with "--epochs 3".
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Single MXNet2 MNIST [attempt 1 of 3]" | |
id: Single_MXNet2_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Single MXNet2 MNIST [attempt 2 of 3]" | |
id: Single_MXNet2_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Single MXNet2 MNIST [attempt 3 of 3]" | |
id: Single_MXNet2_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet2_MNIST && steps.Single_MXNet2_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet2_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet2_mnist.py --epochs 3" | |
shell: bash | |
# Test "Single MXNet MNIST": one command, tried up to three times.
# Every attempt requires the build step's outcome to be 'success' and the
# matrix flag Single_MXNet_MNIST to be truthy. Each attempt creates its own
# artifacts directory and mounts it at /artifacts in the container.
# The container command runs under "timeout 10m" and starts mxnet_mnist.py
# with "--epochs 3".
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Single MXNet MNIST [attempt 1 of 3]" | |
id: Single_MXNet_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Single MXNet MNIST [attempt 2 of 3]" | |
id: Single_MXNet_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Single MXNet MNIST [attempt 3 of 3]" | |
id: Single_MXNet_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST && steps.Single_MXNet_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Test "Single MXNet MNIST [ONECCL MPI]": one command, tried up to three times.
# Every attempt requires the build step's outcome to be 'success' and the
# matrix flag Single_MXNet_MNIST_ONECCL_MPI to be truthy. Each attempt creates
# its own artifacts directory and mounts it at /artifacts in the container.
# Inside the container (under "timeout 10m") the command first executes the
# command line read from /oneccl_env, then writes the text
# '/mpirun_command_mpi' into /mpirun_command, then starts mxnet_mnist.py with
# "--epochs 3". The backslash in "\$(" keeps the runner's shell from expanding
# the substitution, so it is evaluated by the container's bash.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 1 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 2 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Single MXNet MNIST [ONECCL MPI] [attempt 3 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_MPI && steps.Single_MXNet_MNIST_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Test "Single MXNet MNIST [ONECCL OFI]": one command, tried up to three times.
# Every attempt requires the build step's outcome to be 'success' and the
# matrix flag Single_MXNet_MNIST_ONECCL_OFI to be truthy. Each attempt creates
# its own artifacts directory and mounts it at /artifacts in the container.
# Inside the container (under "timeout 10m") the command first executes the
# command line read from /oneccl_env, then writes the text
# '/mpirun_command_ofi' into /mpirun_command, then starts mxnet_mnist.py with
# "--epochs 3". The backslash in "\$(" keeps the runner's shell from expanding
# the substitution, so it is evaluated by the container's bash.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 1 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 2 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Single MXNet MNIST [ONECCL OFI] [attempt 3 of 3]" | |
id: Single_MXNet_MNIST_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_MXNet_MNIST_ONECCL_OFI && steps.Single_MXNet_MNIST_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_MXNet_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/mxnet/mxnet_mnist.py --epochs 3" | |
shell: bash | |
# Test "Single PyTorch MNIST": one command, tried up to three times.
# Every attempt requires the build step's outcome to be 'success' and the
# matrix flag Single_PyTorch_MNIST to be truthy. Each attempt creates its own
# artifacts directory and mounts it at /artifacts in the container.
# The container command runs under "timeout 10m" and starts pytorch_mnist.py
# with "--epochs 3 --data-dir /data/pytorch_datasets".
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Single PyTorch MNIST [attempt 1 of 3]" | |
id: Single_PyTorch_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Single PyTorch MNIST [attempt 2 of 3]" | |
id: Single_PyTorch_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Single PyTorch MNIST [attempt 3 of 3]" | |
id: Single_PyTorch_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST && steps.Single_PyTorch_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c " python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "Single PyTorch MNIST [ONECCL MPI]": one command, tried up to three
# times. Every attempt requires the build step's outcome to be 'success' and
# the matrix flag Single_PyTorch_MNIST_ONECCL_MPI to be truthy. Each attempt
# creates its own artifacts directory and mounts it at /artifacts.
# Inside the container (under "timeout 10m") the command first executes the
# command line read from /oneccl_env, then writes the text
# '/mpirun_command_mpi' into /mpirun_command, then starts pytorch_mnist.py
# with "--epochs 3 --data-dir /data/pytorch_datasets". The backslash in "\$("
# keeps the runner's shell from expanding the substitution, so it is
# evaluated by the container's bash.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 1 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_MPI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 2 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_MPI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Single PyTorch MNIST [ONECCL MPI] [attempt 3 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_MPI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_MPI && steps.Single_PyTorch_MNIST_ONECCL_MPI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_MPI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_mpi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "Single PyTorch MNIST [ONECCL OFI]": one command, tried up to three
# times. Every attempt requires the build step's outcome to be 'success' and
# the matrix flag Single_PyTorch_MNIST_ONECCL_OFI to be truthy. Each attempt
# creates its own artifacts directory and mounts it at /artifacts.
# Inside the container (under "timeout 10m") the command first executes the
# command line read from /oneccl_env, then writes the text
# '/mpirun_command_ofi' into /mpirun_command, then starts pytorch_mnist.py
# with "--epochs 3 --data-dir /data/pytorch_datasets". The backslash in "\$("
# keeps the runner's shell from expanding the substitution, so it is
# evaluated by the container's bash.
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 1 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_OFI_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 2 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_OFI_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Single PyTorch MNIST [ONECCL OFI] [attempt 3 of 3]" | |
id: Single_PyTorch_MNIST_ONECCL_OFI_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Single_PyTorch_MNIST_ONECCL_OFI && steps.Single_PyTorch_MNIST_ONECCL_OFI_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Single_PyTorch_MNIST_ONECCL_OFI_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "\$(cat /oneccl_env) && echo '/mpirun_command_ofi' > /mpirun_command && python /horovod/examples/pytorch/pytorch_mnist.py --epochs 3 --data-dir /data/pytorch_datasets" | |
shell: bash | |
# Test "Spark Keras MNIST": one command, tried up to three times.
# Every attempt requires the build step's outcome to be 'success' and the
# matrix flag Spark_Keras_MNIST to be truthy. Each attempt creates its own
# artifacts directory and mounts it at /artifacts in the container.
# Inside the container (under "timeout 10m") the command sets
# OMP_NUM_THREADS=1 and invokes /spark_env.sh with the python command line
# for keras_spark_mnist.py
# ("--num-proc 2 --work-dir /work --data-dir /data --epochs 3").
# NOTE(review): always() is also true for a cancelled run -- confirm intended.
# Attempt 1: no dependency on an earlier attempt (trailing "&& true");
# a failure is tolerated (continue-on-error: true).
- name: "Spark Keras MNIST [attempt 1 of 3]" | |
id: Spark_Keras_MNIST_run_1 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && true | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Attempt 2: runs only if attempt 1's outcome was 'failure'; a failure is
# still tolerated.
- name: "Spark Keras MNIST [attempt 2 of 3]" | |
id: Spark_Keras_MNIST_run_2 | |
continue-on-error: true | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_1.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Attempt 3: runs only if attempt 2's outcome was 'failure';
# continue-on-error is false, so a failure here is not tolerated.
- name: "Spark Keras MNIST [attempt 3 of 3]" | |
id: Spark_Keras_MNIST_run_3 | |
continue-on-error: false | |
if: always() && steps.build.outcome == 'success' && matrix.Spark_Keras_MNIST && steps.Spark_Keras_MNIST_run_2.outcome == 'failure' | |
run: | | |
mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3 | |
docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3" | |
shell: bash | |
# Spark Keras Rossmann Estimator retry chain (up to three attempts).
# Attempts 1 and 2 tolerate failure (continue-on-error); every later attempt
# runs only when the previous attempt's outcome is 'failure'.
- name: 'Spark Keras Rossmann Estimator [attempt 1 of 3]'
  id: Spark_Keras_Rossmann_Estimator_run_1
  # first attempt: no predecessor to check
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Keras_Rossmann_Estimator
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
- name: 'Spark Keras Rossmann Estimator [attempt 2 of 3]'
  id: Spark_Keras_Rossmann_Estimator_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Keras_Rossmann_Estimator &&
    steps.Spark_Keras_Rossmann_Estimator_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
- name: 'Spark Keras Rossmann Estimator [attempt 3 of 3]'
  id: Spark_Keras_Rossmann_Estimator_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Keras_Rossmann_Estimator &&
    steps.Spark_Keras_Rossmann_Estimator_run_2.outcome == 'failure'
  # final attempt: a failure here is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Estimator_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_estimator.py --num-proc 2 --work-dir /work --data-dir file:///data --epochs 3 --sample-rate 0.1"
# Spark Keras Rossmann Run retry chain (up to three attempts).
# Attempts 1 and 2 tolerate failure (continue-on-error); every later attempt
# runs only when the previous attempt's outcome is 'failure'.
- name: 'Spark Keras Rossmann Run [attempt 1 of 3]'
  id: Spark_Keras_Rossmann_Run_run_1
  # first attempt: no predecessor to check
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Keras_Rossmann_Run
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
- name: 'Spark Keras Rossmann Run [attempt 2 of 3]'
  id: Spark_Keras_Rossmann_Run_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Keras_Rossmann_Run &&
    steps.Spark_Keras_Rossmann_Run_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
- name: 'Spark Keras Rossmann Run [attempt 3 of 3]'
  id: Spark_Keras_Rossmann_Run_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Keras_Rossmann_Run &&
    steps.Spark_Keras_Rossmann_Run_run_2.outcome == 'failure'
  # final attempt: a failure here is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Keras_Rossmann_Run_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/keras/keras_spark_rossmann_run.py --num-proc 2 --data-dir file:///data --epochs 3 --sample-rate 0.1"
# Spark Lightning MNIST retry chain (up to three attempts).
# Attempts 1 and 2 tolerate failure (continue-on-error); every later attempt
# runs only when the previous attempt's outcome is 'failure'.
- name: 'Spark Lightning MNIST [attempt 1 of 3]'
  id: Spark_Lightning_MNIST_run_1
  # first attempt: no predecessor to check
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Lightning_MNIST
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: 'Spark Lightning MNIST [attempt 2 of 3]'
  id: Spark_Lightning_MNIST_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Lightning_MNIST &&
    steps.Spark_Lightning_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: 'Spark Lightning MNIST [attempt 3 of 3]'
  id: Spark_Lightning_MNIST_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Lightning_MNIST &&
    steps.Spark_Lightning_MNIST_run_2.outcome == 'failure'
  # final attempt: a failure here is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Lightning_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# Spark PyTests retry chain (up to three attempts, 30 minute timeout each).
# Inside the container every test_spark*.py file under
# /horovod/test/integration is passed, one at a time, to /pytest_standalone.sh.
# Attempts 1 and 2 tolerate failure (continue-on-error); every later attempt
# runs only when the previous attempt's outcome is 'failure'.
- name: 'Spark PyTests [attempt 1 of 3]'
  id: Spark_PyTests_run_1
  # first attempt: no predecessor to check
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_PyTests
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
- name: 'Spark PyTests [attempt 2 of 3]'
  id: Spark_PyTests_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_PyTests &&
    steps.Spark_PyTests_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
- name: 'Spark PyTests [attempt 3 of 3]'
  id: Spark_PyTests_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_PyTests &&
    steps.Spark_PyTests_run_2.outcome == 'failure'
  # final attempt: a failure here is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_PyTests_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_PyTests_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 30m bash -c "cd /horovod/test/integration && (ls -1 test_spark*.py | xargs -n 1 /bin/bash /pytest_standalone.sh spark)"
# Spark TensorFlow 2.0 MNIST Data Service retry chain (up to three attempts).
# The container command starts compute_worker.py via spark-submit in the
# background (&) and then submits tensorflow2_mnist_data_service.py; both are
# given /tmp/compute.json.
# Attempts 1 and 2 tolerate failure (continue-on-error); every later attempt
# runs only when the previous attempt's outcome is 'failure'.
- name: 'Spark TensorFlow 2.0 MNIST Data Service [attempt 1 of 3]'
  id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
  # first attempt: no predecessor to check
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_TensorFlow_2_0_MNIST_Data_Service
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
- name: 'Spark TensorFlow 2.0 MNIST Data Service [attempt 2 of 3]'
  id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_TensorFlow_2_0_MNIST_Data_Service &&
    steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
- name: 'Spark TensorFlow 2.0 MNIST Data Service [attempt 3 of 3]'
  id: Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_TensorFlow_2_0_MNIST_Data_Service &&
    steps.Spark_TensorFlow_2_0_MNIST_Data_Service_run_2.outcome == 'failure'
  # final attempt: a failure here is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_TensorFlow_2_0_MNIST_Data_Service_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "cd /horovod/examples/spark/tensorflow2; spark-submit --master \"local[2]\" \"/horovod/horovod/spark/tensorflow/compute_worker.py\" /tmp/compute.json & OMP_NUM_THREADS=1 /spark_env.sh spark-submit --master \"local[2]\" --py-files tensorflow2_mnist_data_service_train_fn_compute_side_dispatcher.py,tensorflow2_mnist_data_service_train_fn_training_side_dispatcher.py tensorflow2_mnist_data_service.py /tmp/compute.json"
# Spark Torch MNIST retry chain (up to three attempts).
# Attempts 1 and 2 tolerate failure (continue-on-error); every later attempt
# runs only when the previous attempt's outcome is 'failure'.
- name: 'Spark Torch MNIST [attempt 1 of 3]'
  id: Spark_Torch_MNIST_run_1
  # first attempt: no predecessor to check
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Torch_MNIST
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_1:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: 'Spark Torch MNIST [attempt 2 of 3]'
  id: Spark_Torch_MNIST_run_2
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Torch_MNIST &&
    steps.Spark_Torch_MNIST_run_1.outcome == 'failure'
  continue-on-error: true
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_2:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
- name: 'Spark Torch MNIST [attempt 3 of 3]'
  id: Spark_Torch_MNIST_run_3
  if: >-
    always() &&
    steps.build.outcome == 'success' &&
    matrix.Spark_Torch_MNIST &&
    steps.Spark_Torch_MNIST_run_2.outcome == 'failure'
  # final attempt: a failure here is not tolerated
  continue-on-error: false
  shell: bash
  run: |
    mkdir -p artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3
    docker-compose -f docker-compose.test.yml run -e GITHUB_ACTIONS --rm --volume "$(pwd)/artifacts/${{ matrix.image }}/Spark_Torch_MNIST_run_3:/artifacts" ${{ matrix.image }} /usr/bin/timeout 10m bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
# Upload every XML report found below artifacts/<image>/ as one artifact.
# Runs regardless of earlier step results (always()), but only for images
# whose name contains '-cpu-'.
- name: Upload Test Results
  if: always() && contains(matrix.image, '-cpu-')
  uses: actions/upload-artifact@v4
  with:
    path: artifacts/${{ matrix.image }}/**/*.xml
    name: Unit Test Results - ${{ matrix.image }}
# Builds the two "*min*" test images listed in the matrix with docker-compose.
# This job contains no test steps; it starts after build-and-test and only
# when init-workflow reports that builds and tests should run.
build-mins:
  name: "Build mins (${{ matrix.image }})"
  needs: [init-workflow, build-and-test]
  if: >
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true'
  runs-on: ubuntu-latest

  strategy:
    max-parallel: 2
    fail-fast: false
    matrix:
      include:
        # build_timeout is the per-attempt timeout in minutes for the Build step
        - image: test-cpu-openmpi-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
          build_timeout: 30
        - image: test-gpu-openmpi-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
          build_timeout: 40

  steps:
    - name: Clean up disk space
      # deleting these paths frees 38 GB disk space:
      #   sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
      # but this sometimes takes 3-4 minutes
      # so we delete only some sub-paths which are known to be quick (10s) and 20 GB
      run: |
        echo ::group::Disk space before clean up
        df -h
        echo ::endgroup::
        for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \
                   /usr/share/dotnet/shared \
                   /usr/local/lib/android/sdk/ndk \
                   /usr/local/lib/android/sdk/build-tools \
                   /opt/ghc
        do
          echo ::group::Deleting "$dir"
          sudo du -hsc $dir | tail -n1 || true
          sudo rm -rf $dir
          echo ::endgroup::
        done
        echo ::group::Disk space after clean up
        df -h
        echo ::endgroup::

    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: recursive

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        # quoted so YAML keeps the version a string instead of parsing a float
        # (an unquoted 3.10 would silently become 3.1)
        python-version: '3.8'

    - name: Setup docker-compose
      run: pip install docker-compose

    - name: Build
      id: build
      # timeout-and-retry.sh receives "<timeout>m 3 10" followed by the build command
      run: |
        .github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml build ${{ matrix.image }}
      env:
        COMPOSE_DOCKER_CLI_BUILD: 1
        DOCKER_BUILDKIT: 1

    - name: Upload Test Results
      uses: actions/upload-artifact@v4
      # always() uploads even after a failed build; restricted to '-cpu-' images
      if: always() && contains(matrix.image, '-cpu-')
      with:
        name: Unit Test Results - ${{ matrix.image }}
        path: artifacts/${{ matrix.image }}/**/*.xml
# Builds Horovod directly on a macOS runner (no docker) for each matrix entry
# and runs the test/parallel suite through horovodrun, with up to three
# attempts. Each attempt writes JUnit XML into its own artifacts directory and
# exposes that directory as the step output 'artifacts-path'.
build-and-test-macos:
  name: 'Build and Test macOS (${{ matrix.image }}-macos)'
  # NOTE(review): confirm that the macos-11 hosted runner label is still offered by GitHub.
  runs-on: macos-11
  needs: [init-workflow, build-and-test]
  if: >
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true'

  strategy:
    fail-fast: false
    max-parallel: 3
    matrix:
      include:
        - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0
          HOROVOD_WITH_MPI: 1
          HOROVOD_WITHOUT_GLOO: 1
          TENSORFLOW: '1.15.0'
          KERAS: '2.2.4'
          PYTORCH: '1.6.0'
          PYTORCH_LIGHTNING: '1.3.8'
          TORCHVISION: '0.7.0'
          MXNET: '1.5.1.post0'

        - image: test-cpu-gloo-py3_8-tf2_9_2-keras2_9_0-torch1_11_0-mxnet1_7_0_p2
          HOROVOD_WITHOUT_MPI: 1
          HOROVOD_WITH_GLOO: 1
          TENSORFLOW: '2.9.2'
          KERAS: '2.9.0'
          PYTORCH: '1.11.0'
          PYTORCH_LIGHTNING: '1.5.9'
          TORCHVISION: '0.12.0'
          MXNET: '1.7.0.post2'

        - image: test-openmpi-cpu-gloo-py3_8-tf2_10_0-keras2_10_0-torch1_12_1-mxnet1_9_1
          HOROVOD_WITH_MPI: 1
          HOROVOD_WITH_GLOO: 1
          TENSORFLOW: '2.10.0'
          KERAS: '2.10.0'
          PYTORCH: '1.12.1'
          PYTORCH_LIGHTNING: '1.5.9'
          TORCHVISION: '0.13.1'
          MXNET: '1.9.1'

  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: recursive

    - name: Build
      id: build
      # matrix entries that do not define a key yield an empty variable
      env:
        HOROVOD_WITH_MPI: ${{ matrix.HOROVOD_WITH_MPI }}
        HOROVOD_WITHOUT_MPI: ${{ matrix.HOROVOD_WITHOUT_MPI }}
        HOROVOD_WITH_GLOO: ${{ matrix.HOROVOD_WITH_GLOO }}
        HOROVOD_WITHOUT_GLOO: ${{ matrix.HOROVOD_WITHOUT_GLOO }}
        TENSORFLOW: ${{ matrix.TENSORFLOW }}
        KERAS: ${{ matrix.KERAS }}
        PYTORCH: ${{ matrix.PYTORCH }}
        PYTORCH_LIGHTNING: ${{ matrix.PYTORCH_LIGHTNING }}
        TORCHVISION: ${{ matrix.TORCHVISION }}
        MXNET: ${{ matrix.MXNET }}
      # pyenv installs Python 3.7.7 with a patch supplied by the Python team; it works around
      # an incompatibility introduced by the newer Xcode version in macOS Big Sur.
      # Original discussion: https://github.com/pyenv/pyenv/issues/1737
      run: |
        brew reinstall -f zlib bzip2
        brew install -f openmpi cmake libuv pyenv coreutils curl
        export PATH=$(pyenv root)/shims:$PATH
        pyenv uninstall -f 3.7.7
        CFLAGS="-I$(brew --prefix bzip2)/include -I$(brew --prefix zlib)/include" LDFLAGS="-L$(brew --prefix zlib)/lib -L$(brew --prefix bzip2)/lib" pyenv install --patch 3.7.7 < <(curl -sSL https://github.com/python/cpython/commit/8ea6353.patch)
        pyenv global 3.7.7
        python --version
        python -m pip install -U pip
        pip install tensorflow==${TENSORFLOW} keras==${KERAS}
        if [[ ${TENSORFLOW} == 1.* ]] || [[ ${TENSORFLOW} == 2.[012345].* ]]; then pip install "h5py<3" "protobuf~=3.20"; fi
        pip install torch==${PYTORCH} pytorch_lightning==${PYTORCH_LIGHTNING} torchvision==${TORCHVISION}
        pip install mxnet==${MXNET}
        HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir .[test]
        horovodrun --check-build

    # Attempts 1 and 2 tolerate failure; each later attempt runs only when the
    # previous attempt's outcome is 'failure'. The script generates pytest.sh
    # and feeds every test_*.py in test/parallel to horovodrun (2 processes),
    # bounded by gtimeout 10m.
    - name: 'Test [attempt 1 of 3]'
      id: test-1
      # first attempt: no predecessor to check
      if: always() && steps.build.outcome == 'success'
      continue-on-error: true
      run: |
        export PATH=$(pyenv root)/shims:$PATH
        pyenv global 3.7.7
        python --version
        artifacts_path="$(pwd)/artifacts/${{ matrix.image }}-macos-run-1"
        mkdir -p "$artifacts_path"
        echo "artifacts-path=$artifacts_path" >> $GITHUB_OUTPUT
        echo pytest -v --capture=no --continue-on-collection-errors --junit-xml=$artifacts_path/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > pytest.sh
        chmod u+x pytest.sh
        cd test/parallel
        ls test_*.py | gtimeout 10m xargs -n 1 horovodrun -np 2 /bin/bash ../../pytest.sh macos

    - name: 'Test [attempt 2 of 3]'
      id: test-2
      if: always() && steps.build.outcome == 'success' && steps.test-1.outcome == 'failure'
      continue-on-error: true
      run: |
        export PATH=$(pyenv root)/shims:$PATH
        pyenv global 3.7.7
        python --version
        artifacts_path="$(pwd)/artifacts/${{ matrix.image }}-macos-run-2"
        mkdir -p "$artifacts_path"
        echo "artifacts-path=$artifacts_path" >> $GITHUB_OUTPUT
        echo pytest -v --capture=no --continue-on-collection-errors --junit-xml=$artifacts_path/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > pytest.sh
        chmod u+x pytest.sh
        cd test/parallel
        ls test_*.py | gtimeout 10m xargs -n 1 horovodrun -np 2 /bin/bash ../../pytest.sh macos

    - name: 'Test [attempt 3 of 3]'
      id: test-3
      if: always() && steps.build.outcome == 'success' && steps.test-2.outcome == 'failure'
      # final attempt: a failure here is not tolerated
      continue-on-error: false
      run: |
        export PATH=$(pyenv root)/shims:$PATH
        pyenv global 3.7.7
        python --version
        artifacts_path="$(pwd)/artifacts/${{ matrix.image }}-macos-run-3"
        mkdir -p "$artifacts_path"
        echo "artifacts-path=$artifacts_path" >> $GITHUB_OUTPUT
        echo pytest -v --capture=no --continue-on-collection-errors --junit-xml=$artifacts_path/junit.\$1.\${HOROVOD_RANK:-\${OMPI_COMM_WORLD_RANK:-\${PMI_RANK}}}.\$2.xml \${@:2} > pytest.sh
        chmod u+x pytest.sh
        cd test/parallel
        ls test_*.py | gtimeout 10m xargs -n 1 horovodrun -np 2 /bin/bash ../../pytest.sh macos

    # One path per attempt; attempts that did not run contribute an empty line.
    - name: Upload Test Results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: Unit Test Results - ${{ matrix.image }}-macos
        path: |
          ${{ steps.test-1.outputs.artifacts-path }}
          ${{ steps.test-2.outputs.artifacts-path }}
          ${{ steps.test-3.outputs.artifacts-path }}
# Starts the Buildkite pipeline in "GPU NON HEADS" mode and exposes the build
# URL as job output 'url'. Only runs in the horovod/horovod repository, and
# for pull requests only when the head branch lives in that same repository.
buildkite-trigger:
  name: 'Build and Test GPU (trigger Builtkite)'
  needs: [init-workflow, build-and-test]
  if: >
    github.repository == 'horovod/horovod' &&
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true' &&
    ( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
  runs-on: ubuntu-latest
  outputs:
    url: ${{ steps.build.outputs.url }}

  steps:
    - name: Trigger Buildkite Pipeline
      id: build
      # NOTE(review): the action is referenced by branch (@master), not by a pinned version — confirm this is intended.
      uses: EnricoMi/trigger-pipeline-action@master
      env:
        BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
        PIPELINE: 'horovod/horovod'
        # COMMIT is taken from GITHUB_SHA
        BRANCH: '${{ needs.init-workflow.outputs.buildkite-branch-label }} (GPU NON HEADS)'
        # empty MESSAGE will be filled by Buildkite from commit message
        MESSAGE: '${{ needs.init-workflow.outputs.buildkite-message }}'
        BUILD_ENV_VARS: '{"PIPELINE_MODE": "GPU NON HEADS"}'
# Fetches the artifacts of the Buildkite build started by buildkite-trigger,
# re-uploads the XML reports as a workflow artifact, and fails the job when
# the download step concluded successfully but reports a build state other
# than 'passed'.
buildkite:
  name: 'Build and Test GPU (download Builtkite)'
  runs-on: ubuntu-latest
  needs: [buildkite-trigger]

  steps:
    - name: Download Buildkite Artifacts
      id: download
      uses: EnricoMi/download-buildkite-artifact-action@v1
      with:
        buildkite_build_url: ${{ needs.buildkite-trigger.outputs.url }}
        buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
        output_path: 'artifacts/Unit Test Results - GPU NON HEADS on Builtkite'
        ignore_build_states: 'blocked,canceled,skipped,not_run'
        ignore_job_states: 'timed_out'

    - name: Upload Test Results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: 'Unit Test Results - GPU NON HEADS on Builtkite'
        path: 'artifacts/Unit Test Results - GPU NON HEADS on Builtkite/**/*.xml'

    - name: Check Buildkite job state
      if: always() && steps.download.conclusion == 'success' && steps.download.outputs.build-state != 'passed'
      run: |
        echo "::warning::Buildkite pipeline did not pass: ${{ needs.buildkite-trigger.outputs.url }}"
        exit 1
# Starts the Buildkite pipeline in "GPU HEADS" mode and exposes the build URL
# as job output 'url'. Only runs in the horovod/horovod repository, and for
# pull requests only when the head branch lives in that same repository.
buildkite-heads-trigger:
  name: 'Build and Test GPU heads (trigger Builtkite)'
  needs: [init-workflow, build-and-test]
  if: >
    github.repository == 'horovod/horovod' &&
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true' &&
    ( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
  runs-on: ubuntu-latest
  outputs:
    url: ${{ steps.build.outputs.url }}

  steps:
    - name: Trigger Buildkite Pipeline
      id: build
      # NOTE(review): the action is referenced by branch (@master), not by a pinned version — confirm this is intended.
      uses: EnricoMi/trigger-pipeline-action@master
      env:
        BUILDKITE_API_ACCESS_TOKEN: ${{ secrets.BUILDKITE_TOKEN }}
        PIPELINE: 'horovod/horovod'
        # COMMIT is taken from GITHUB_SHA
        BRANCH: '${{ needs.init-workflow.outputs.buildkite-branch-label }} (GPU HEADS)'
        # empty MESSAGE will be filled by Buildkite from commit message
        MESSAGE: '${{ needs.init-workflow.outputs.buildkite-message }}'
        BUILD_ENV_VARS: '{"PIPELINE_MODE": "GPU HEADS"}'
# Fetches the artifacts of the Buildkite build started by
# buildkite-heads-trigger, re-uploads the XML reports as a workflow artifact,
# and fails the job when the download step concluded successfully but reports
# a build state other than 'passed'.
buildkite-heads:
  name: 'Build and Test GPU heads (download Builtkite)'
  runs-on: ubuntu-latest
  needs: [buildkite-heads-trigger]

  steps:
    - name: Download Buildkite Artifacts
      id: download
      uses: EnricoMi/download-buildkite-artifact-action@v1
      with:
        buildkite_build_url: ${{ needs.buildkite-heads-trigger.outputs.url }}
        buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
        output_path: 'artifacts/Unit Test Results - GPU HEADS on Builtkite'
        ignore_build_states: 'blocked,canceled,skipped,not_run'
        ignore_job_states: 'timed_out'

    - name: Upload Test Results
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: 'Unit Test Results - GPU HEADS on Builtkite'
        path: 'artifacts/Unit Test Results - GPU HEADS on Builtkite/**/*.xml'

    - name: Check Buildkite job state
      if: always() && steps.download.conclusion == 'success' && steps.download.outputs.build-state != 'passed'
      run: |
        echo "::warning::Buildkite pipeline did not pass: ${{ needs.buildkite-heads-trigger.outputs.url }}"
        exit 1
# Decides whether the docker images are built ('run') and whether they are
# pushed ('push'), and publishes both decisions as job outputs.
docker-config:
  name: Configure docker build
  runs-on: ubuntu-latest
  needs: [init-workflow, build-and-test, buildkite]
  # always() stops this job from being skipped automatically when a needed job
  # was skipped; buildkite, for example, is skipped whenever its trigger job
  # does not run (such as for pull requests from forks). The condition then
  # requires build-and-test to have succeeded and buildkite to have either
  # succeeded or been skipped.
  if: >
    always() &&
    needs.init-workflow.outputs.run-at-all == 'true' &&
    needs.init-workflow.outputs.run-builds-and-tests == 'true' &&
    needs.build-and-test.result == 'success' &&
    ( needs.buildkite.result == 'success' || needs.buildkite.result == 'skipped' )
  outputs:
    run: ${{ steps.config.outputs.run }}
    push: ${{ steps.config.outputs.push }}

  steps:
    - name: Config
      id: config
      env:
        # push: true only on the Horovod repo and only for schedule and push events
        push: ${{ github.repository == 'horovod/horovod' && contains('schedule,push', github.event_name) }}
        # run: true for every event on the Horovod repo, and for non-schedule events elsewhere
        run: ${{ github.repository == 'horovod/horovod' || github.event_name != 'schedule' }}
      # log the inputs and decisions, then export the decisions as step outputs
      run: |
        echo Repository: ${{ github.repository }}
        echo Event: ${{ github.event_name }}
        echo Run: $run
        echo "run=$run" >> $GITHUB_OUTPUT
        echo Push: $push
        echo "push=$push" >> $GITHUB_OUTPUT
docker-build: | |
name: Build docker image ${{ matrix.docker-image }} (push=${{ needs.docker-config.outputs.push }}) | |
needs: docker-config | |
if: always() && needs.docker-config.outputs.run == 'true' | |
runs-on: ubuntu-latest | |
# we want an ongoing run of this workflow to be canceled by a later commit | |
# so that there is only one concurrent run of this workflow for each branch | |
concurrency: | |
# github.ref means something like refs/heads/master or refs/tags/v0.22.1 or the branch. | |
# This helps to not cancel concurrent runs on master and a tag that share the same commit | |
# head_ref refers to the pull request branch so we run only one workflow for the given pull request. | |
# On master, head_ref is empty, so we use the SHA of the commit, this means | |
# commits to master will not be cancelled, which is important to ensure | |
# that every commit to master is full tested and deployed. | |
group: docker-${{ matrix.docker-image }}-${{ github.ref }}-${{ github.head_ref || github.sha }} | |
cancel-in-progress: true | |
strategy: | |
fail-fast: false | |
matrix: | |
docker-image: | |
- horovod | |
- horovod-cpu | |
- horovod-nvtabular | |
- horovod-ray | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v4 | |
with: | |
submodules: 'recursive' | |
- name: Docker meta | |
id: meta | |
uses: crazy-max/ghaction-docker-meta@v5 | |
with: | |
# list of Docker images to use as base name for tags | |
images: | | |
horovod/${{ matrix.docker-image }} | |
# generate Docker tags based on the following events/attributes | |
tags: | | |
type=schedule | |
type=ref,event=branch | |
type=ref,event=pr | |
type=semver,pattern={{version}} | |
type=semver,pattern={{major}}.{{minor}} | |
type=semver,pattern={{major}} | |
type=sha | |
- name: Set up Docker Buildx | |
uses: docker/setup-buildx-action@v3 | |
with: | |
driver: docker | |
- name: Login to DockerHub | |
if: needs.docker-config.outputs.push == 'true' | |
uses: docker/login-action@v3 | |
with: | |
username: ${{ secrets.DOCKERHUB_USERNAME }} | |
password: ${{ secrets.DOCKERHUB_TOKEN }} | |
- name: Clean up disk space | |
# deleting these paths frees 38 GB disk space: | |
# sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc | |
# but this sometimes takes 3-4 minutes | |
# so we delete only some sub-paths which are known to be quick (10s) and 20 GB | |
run: | | |
echo ::group::Disk space before clean up | |
df -h | |
echo ::endgroup:: | |
for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \ | |
/usr/share/dotnet/shared \ | |
/usr/local/lib/android/sdk/ndk \ | |
/usr/local/lib/android/sdk/build-tools \ | |
/opt/ghc | |
do | |
echo ::group::Deleting "$dir" | |
sudo du -hsc $dir | tail -n1 || true | |
sudo rm -rf $dir | |
echo ::endgroup:: | |
done | |
echo ::group::Disk space after clean up | |
df -h | |
echo ::endgroup:: | |
- name: Build image | |
id: build | |
uses: docker/build-push-action@v5 | |
timeout-minutes: 60 | |
with: | |
context: . | |
file: ./docker/${{ matrix.docker-image }}/Dockerfile | |
pull: true | |
push: false | |
load: true | |
tags: horovod-test | |
outputs: type=docker | |
- name: List image | |
run: | | |
docker image ls horovod-test | |
- name: Prepare container for test | |
run: | | |
grep "RUN sed" Dockerfile.test.cpu | sed "s/^RUN //" | docker run -i --name horovod-test horovod-test:latest /bin/bash | |
# PyTorch MNIST example with the gloo communication option.
# `always()` lets this test run even if an earlier step failed;
# the outcome check still requires the image build to have succeeded.
- name: Test image (pytorch gloo)
  if: always() && steps.build.outcome == 'success'
  run: |
    docker start -ai horovod-test <<<"python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication gloo"
# TensorFlow 2 Keras MNIST example, invoked with the arguments
# `2 localhost:2 gloo`.
# `always()` lets this test run even if an earlier step failed;
# the outcome check still requires the image build to have succeeded.
- name: Test image (tensorflow2 gloo)
  if: always() && steps.build.outcome == 'success'
  run: |
    docker start -ai horovod-test <<<"python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 gloo"
# PyTorch MNIST example with the mpi communication option.
# Same gating as the gloo tests, and additionally skipped for the
# `horovod-ray` image.
# NOTE(review): presumably that image ships without MPI - confirm.
- name: Test image (pytorch mpi)
  if: always() && steps.build.outcome == 'success' && matrix.docker-image != 'horovod-ray'
  run: |
    docker start -ai horovod-test <<<"python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets --num-proc 2 --hosts localhost:2 --communication mpi"
# TensorFlow 2 Keras MNIST example, invoked with the arguments
# `2 localhost:2 mpi`.
# Same gating as the gloo tests, and additionally skipped for the
# `horovod-ray` image.
# NOTE(review): presumably that image ships without MPI - confirm.
- name: Test image (tensorflow2 mpi)
  if: always() && steps.build.outcome == 'success' && matrix.docker-image != 'horovod-ray'
  run: |
    docker start -ai horovod-test <<<"python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py 2 localhost:2 mpi"
# Publish the image, only for runs flagged as pushing by the docker-config job.
# No status function is used in `if`, so this step is skipped when any
# previous step (including the image tests) has failed.
# The build is run again with the tags and labels produced by the `meta` step.
- name: Push image
  timeout-minutes: 60
  uses: docker/build-push-action@v5
  if: needs.docker-config.outputs.push == 'true'
  with:
    context: .
    file: ./docker/${{ matrix.docker-image }}/Dockerfile
    tags: ${{ steps.meta.outputs.tags }}
    labels: ${{ steps.meta.outputs.labels }}
    push: ${{ needs.docker-config.outputs.push }}
# Diagnostic report, produced regardless of the outcome of earlier steps:
# disk usage, Docker disk usage, all images and all containers.
- name: Show free space
  if: always()
  run: |
    # print the output of a command inside a collapsible log group:
    # first argument is the group title, the rest is the command to run
    report() {
      echo "::group::$1"
      shift
      "$@"
      echo ::endgroup::
    }
    report "Disk Space" df -h
    report "Docker Space" docker system df
    report "Docker Images" docker images -a
    report "Docker Container" docker container list -a
sync-files:
  # Checks that pairs of files that are meant to mirror each other stay in sync.
  #
  # For every matrix entry, `<right_file>.patch` holds the accepted differences
  # between `left_file` and `right_file`. Applying that patch to `left_file`
  # must reproduce `right_file` exactly, otherwise the job fails and prints
  # both the unexpected differences and the diff to use as the new patch file.
  name: "Sync Files (${{ matrix.name }})"
  needs: [init-workflow]
  runs-on: ubuntu-latest

  strategy:
    fail-fast: false
    matrix:
      include:
        # `init` is an optional command that is run before the comparison
        - name: Docs Summary
          left_file: README.rst
          right_file: docs/summary.rst
          init: sed -i -e s/docs\///g README.rst
        - name: Examples Keras Spark3
          left_file: examples/spark/keras/keras_spark_rossmann_run.py
          right_file: examples/spark/keras/keras_spark3_rossmann.py

  steps:
    - name: Checkout
      uses: actions/checkout@v4

    - name: Diffing ${{ matrix.left_file }} with ${{ matrix.right_file }}
      # matrix values are handed over through the environment rather than
      # being interpolated into the script text
      env:
        LEFT: ${{ matrix.left_file }}
        RIGHT: ${{ matrix.right_file }}
        INIT: ${{ matrix.init }}
      run: |
        # INIT is intentionally left unquoted: it has to be word-split into a
        # command and its arguments, and it expands to nothing (a no-op) for
        # matrix entries that define no init command
        $INIT

        # all file paths are quoted so that they are never word-split or glob-expanded
        patch --quiet -p0 "$LEFT" "${RIGHT}.patch" -o "${LEFT}.expected"
        if ! diff -q "${LEFT}.expected" --label "$LEFT" "$RIGHT"
        then
          echo
          echo "::error::Files are out-of-sync: $LEFT vs. $RIGHT"
          echo "Unexpected differences are:"
          diff "${LEFT}.expected" --label "$LEFT" "$RIGHT" || true
          echo
          echo "Use the following as ${RIGHT}.patch to accept those changes:"
          diff "$LEFT" "$RIGHT" || true
          false
        fi