maxtest: add --libtpu_args passthrough for custom libtpu/XLA flags

maxtest.sh accepts a new `--libtpu_args` option, exports its value as
LIBTPU_ARGS and lists it in YAML_VARS so that envsubst substitutes it into
maxtest.yaml.template, where it is echoed and exported as LIBTPU_INIT_ARGS
before the healthscan benchmark starts. The healthscan image reference and the
getting-started guide are updated as well.

Notes:
  * maxtest.sh keeps its absolute `#!/bin/bash` interpreter line, so this patch
    carries no hunk for line 1. A shebang of `#!bin/bash` is a relative path
    that is resolved against the caller's working directory, so `./maxtest.sh`
    would only start from a directory that happens to contain `bin/bash`.
  * In maxtest.yaml.template the background job's PID is captured with `$!`.
    `$1` is the first positional parameter, not the PID of the job that was
    just backgrounded, so the `kill -0 $PID` polling loop could not track it.
  * This patch was recovered from a copy whose whitespace had been collapsed.
    Leading indentation and blank context lines are not reproduced below, so
    the @@ line counts can exceed the context shown, and the index hashes
    predate the two corrections above. Regenerate it with `git diff` before
    applying.

diff --git a/benchmarks/maxtest/getting_started.md b/benchmarks/maxtest/getting_started.md
index e272e9eab..3037a1b02 100644
--- a/benchmarks/maxtest/getting_started.md
+++ b/benchmarks/maxtest/getting_started.md
@@ -44,6 +44,21 @@ EXIT_CODE=0
 - maxtest.sh will generate a YAML file in the directory that is passed to kubectl. This file can be modified and reused by running `kubectl apply -f maxtest.yaml`
+### Passing custom libtpu or XLA flags ###
+
+Custom libtpu or XLA flags can be passed to the job by specifying
+`--libtpu_args`, quoted so that the value is a single argument.
+
+
+#### Setting flags for SDC checking ####
+
+Useful for checking for silent data corruption (SDC) on TPU hardware.
+
+```
+bash maxtest.sh --project $TPU_PROJECT --cluster $CLUSTER --region $REGION --nodepool $NODEPOOL_NAME --num_workers $NUM_WORKERS --libtpu_args '--xla_tpu_enable_sdc_checker'
+```
+
+
 ### Debugging common job errors ###
 If the job does not exit with `EXIT_CODE=0`, there is a failure among one of
diff --git a/benchmarks/maxtest/maxtest.sh b/benchmarks/maxtest/maxtest.sh
index 3a6a625c8..927dc9fe4 100644
--- a/benchmarks/maxtest/maxtest.sh
+++ b/benchmarks/maxtest/maxtest.sh
@@ -15,6 +15,7 @@ while [[ "$#" > 0 ]]; do case $1 in
 -r|--region) GKE_REGION="$2";shift;shift;;
 --nodepool) NODEPOOL="$2";shift;shift;;
 --num_workers) NUM_WORKERS="$2";shift;shift;;
+--libtpu_args) LIBTPU_ARGS="$2";shift;shift;;
 *) usage "Unknown parameter passed: $1"; shift; shift;;
 esac; done
@@ -32,19 +33,20 @@ if [ -z "$TPU_ACCELERATOR" ]; then exit; fi;
 UUID=$(uuidgen)
 export JOB_NAME="${UUID:0:5}-maxtest"
-export DOCKER_IMAGE="gcr.io/cloud-tpu-images-public/tpu/healthscan"
+export DOCKER_IMAGE="us-docker.pkg.dev/cloud-tpu-images-public/tpu/healthscan:latest"
 export NODEPOOL
 export TPU_TOPOLOGY
 export TPU_ACCELERATOR
 export GKE_PROJECT
 export GKE_REGION
 export GKE_CLUSTER
+export LIBTPU_ARGS
 export MEMORY_PER_HOST="407Gi"
 export TPU_CHIPS_PER_HOST=4
 export COMPLETIONS=$NUM_WORKERS # Number of VMs in the nodepool (v6e -> 2 VMs for v6e-8, v5p -> 1 VM for a v5p-8)
-YAML_VARS='$JOB_NAME $DOCKER_IMAGE $NODEPOOL $TPU_TOPOLOGY $TPU_ACCELERATOR $COMPLETIONS $MEMORY_PER_HOST $TPU_CHIPS_PER_HOST $GKE_PROJECT $GKE_REGION $GKE_CLUSTER'
+YAML_VARS='$JOB_NAME $DOCKER_IMAGE $NODEPOOL $TPU_TOPOLOGY $TPU_ACCELERATOR $COMPLETIONS $MEMORY_PER_HOST $TPU_CHIPS_PER_HOST $GKE_PROJECT $GKE_REGION $GKE_CLUSTER $LIBTPU_ARGS'
 envsubst "${YAML_VARS}" < maxtest.yaml.template > maxtest.yaml
diff --git a/benchmarks/maxtest/maxtest.yaml.template b/benchmarks/maxtest/maxtest.yaml.template
index 803ececa4..fd7a1a328 100644
--- a/benchmarks/maxtest/maxtest.yaml.template
+++ b/benchmarks/maxtest/maxtest.yaml.template
@@ -42,7 +42,7 @@ spec:
 _sigterm() (kill -SIGTERM $! 2>/dev/null;);
 trap _sigterm SIGTERM;
-(export TPU_STDERR_LOG_LEVEL=0 && export TPU_MIN_LOG_LEVEL=0 && export TF_CPP_MIN_LOG_LEVEL=0 && python3 -m benchmarks.benchmark_runner healthscan --device_type=$TPU_ACCELERATOR_TYPE --base_output_directory=gke-healthscan-output --num_steps=5) & PID=$1;
+(export TPU_STDERR_LOG_LEVEL=0 && export TPU_MIN_LOG_LEVEL=0 && export TF_CPP_MIN_LOG_LEVEL=0 && echo LIBTPU_INIT_ARGS='$LIBTPU_ARGS' && export LIBTPU_INIT_ARGS='$LIBTPU_ARGS' && python3 -m benchmarks.benchmark_runner healthscan --device_type=$TPU_ACCELERATOR_TYPE --base_output_directory=gke-healthscan-output --num_steps=5) & PID=$!;
 while kill -0 $PID 2>/dev/null; do sleep 5;