diff --git a/README.md b/README.md index 9b1d01e27..64e48731c 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ cleanup with a `Cluster Delete`. * Cluster Create (provision on-demand capacity): ```shell - python3 xpk/xpk.py cluster create \ + python3 xpk.py cluster create \ --cluster xpk-test --tpu-type=v5litepod-16 \ --num-slices=4 ``` @@ -67,7 +67,7 @@ cleanup with a `Cluster Delete`. * Cluster Create (provision reserved capacity): ```shell - python3 xpk/xpk.py cluster create \ + python3 xpk.py cluster create \ --cluster xpk-test --tpu-type=v5litepod-256 \ --num-slices=2 \ --custom-tpu-nodepool-arguments="--reservation-affinity=specific --reservation=RESERVATION_ID" @@ -79,7 +79,7 @@ cleanup with a `Cluster Delete`. For example, if a user creates a cluster with 4 slices: ```shell - python3 xpk/xpk.py cluster create \ + python3 xpk.py cluster create \ --cluster xpk-test --tpu-type=v5litepod-16 \ --num-slices=4 ``` @@ -88,7 +88,7 @@ cleanup with a `Cluster Delete`. new slices: ```shell - python3 xpk/xpk.py cluster create \ + python3 xpk.py cluster create \ --cluster xpk-test --tpu-type=v5litepod-16 \ --num-slices=8 ``` @@ -98,13 +98,13 @@ cleanup with a `Cluster Delete`. Use `--force` to skip prompts. ```shell - python3 xpk/xpk.py cluster create \ + python3 xpk.py cluster create \ --cluster xpk-test --tpu-type=v5litepod-16 \ --num-slices=6 # Skip delete prompts using --force. - python3 xpk/xpk.py cluster create --force \ + python3 xpk.py cluster create --force \ --cluster xpk-test --tpu-type=v5litepod-16 \ --num-slices=6 @@ -113,20 +113,20 @@ cleanup with a `Cluster Delete`. 
* Cluster Delete (deprovision capacity): ```shell - python3 xpk/xpk.py cluster delete \ + python3 xpk.py cluster delete \ --cluster xpk-test ``` ## Cluster List * Cluster List (see provisioned capacity): ```shell - python3 xpk/xpk.py cluster list + python3 xpk.py cluster list ``` ## Cluster Describe * Cluster Describe (see capacity): ```shell - python3 xpk/xpk.py cluster describe \ + python3 xpk.py cluster describe \ --cluster xpk-test ``` @@ -134,7 +134,7 @@ cleanup with a `Cluster Delete`. * Cluster Cacheimage (enables faster start times): ```shell - python3 xpk/xpk.py cluster cacheimage \ + python3 xpk.py cluster cacheimage \ --cluster xpk-test --docker-image gcr.io/your_docker_image ``` @@ -142,7 +142,7 @@ cleanup with a `Cluster Delete`. * Workload Create (submit training job): ```shell - python3 xpk/xpk.py workload create \ + python3 xpk.py workload create \ --workload xpk-test-workload --command "echo goodbye" --cluster \ xpk-test --tpu-type=v5litepod-16 ``` @@ -151,7 +151,7 @@ cleanup with a `Cluster Delete`. * Workload Delete (delete training job): ```shell - python3 xpk/xpk.py workload delete \ + python3 xpk.py workload delete \ --workload xpk-test-workload --cluster xpk-test ``` @@ -159,7 +159,7 @@ cleanup with a `Cluster Delete`. * Workload List (see training jobs): ```shell - python3 xpk/xpk.py workload list \ + python3 xpk.py workload list \ --cluster xpk-test ``` @@ -179,19 +179,19 @@ This flow pulls the `--script-dir` into the `--base-docker-image` and runs the n - `--script-dir` sets which directory to pull into the image. This defaults to the current working directory. - See `python3 xpk/xpk.py workload create --help` for more info. + See `python3 xpk.py workload create --help` for more info. 
* Example with defaults which pulls the local directory into the base image: ```shell echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > test.sh - python3 xpk/xpk.py workload create --cluster xpk-test \ + python3 xpk.py workload create --cluster xpk-test \ --workload xpk-test-workload-base-image --command "bash test.sh" \ --tpu-type=v5litepod-16 --num-slices=1 ``` * Recommended Flow For Normal Sized Jobs (fewer than 10k accelerators): ```shell - python3 xpk/xpk.py workload create --cluster xpk-test \ + python3 xpk.py workload create --cluster xpk-test \ --workload xpk-test-workload-base-image --command "bash custom_script.sh" \ --base-docker-image=gcr.io/your_dependencies_docker_image \ --tpu-type=v5litepod-16 --num-slices=1 @@ -204,17 +204,17 @@ workload. * Running with `--docker-image`: ```shell - python3 xpk/xpk.py workload create --cluster xpk-test \ + python3 xpk.py workload create --cluster xpk-test \ --workload xpk-test-workload-base-image --command "bash test.sh" \ --tpu-type=v5litepod-16 --num-slices=1 --docker-image=gcr.io/your_docker_image ``` * Recommended Flow For Large Sized Jobs (more than 10k accelerators): ```shell - python3 xpk/xpk.py cluster cacheimage \ + python3 xpk.py cluster cacheimage \ --cluster xpk-test --docker-image gcr.io/your_docker_image # Run workload create with the same image. - python3 xpk/xpk.py workload create --cluster xpk-test \ + python3 xpk.py workload create --cluster xpk-test \ --workload xpk-test-workload-base-image --command "bash test.sh" \ --tpu-type=v5litepod-16 --num-slices=1 --docker-image=gcr.io/your_docker_image ``` @@ -245,5 +245,5 @@ Please select a CPU type that exists in all zones in the region. # Find CPU Types supported in zones. gcloud compute machine-types list --zones=$ZONE_LIST # Adjust default cpu machine type. -python3 xpk/xpk.py cluster create --cluster-cpu-machine-type=CPU_TYPE ... +python3 xpk.py cluster create --cluster-cpu-machine-type=CPU_TYPE ... 
``` diff --git a/Run_MaxText.md b/Run_MaxText.md index 3ffff7f27..cedc7a464 100644 --- a/Run_MaxText.md +++ b/Run_MaxText.md @@ -72,9 +72,11 @@ sudo apt update && sudo apt-get install google-cloud-sdk-gke-gcloud-auth-plugin gcloud config set project $PROJECT_ID gcloud config set compute/zone $ZONE - # Make sure you are in the maxtext github root directory when running this command + # Make sure you are in the xpk GitHub root directory when running this command + git clone https://github.com/google/xpk.git + cd xpk - python3 xpk/xpk.py workload create \ + python3 xpk.py workload create \ --cluster ${CLUSTER_NAME} \ --base-docker-image gcr.io/${PROJECT_ID}/${USER}_runner \ --workload ${USER}-first-job \ diff --git a/xpk-large-scale-guide.sh b/xpk-large-scale-guide.sh index 9e04952d1..0ec01af5c 100644 --- a/xpk-large-scale-guide.sh +++ b/xpk-large-scale-guide.sh @@ -137,20 +137,19 @@ export TPU_NODEPOOL_ARGUMENTS=" \ # Git clone and go to the correct directory. ##### 2B ##################### -git clone https://github.com/google/maxtext.git && cd maxtext - +git clone https://github.com/google/xpk.git && cd xpk ##### 2C ##################### # Confirm that variables are correctly set: ##### 2C ##################### -echo python3 xpk/xpk.py cluster create \ +echo python3 xpk.py cluster create \ --cluster "${CLUSTER}" --tpu-type=v5litepod-256 \ --num-slices="${NUMSLICES}" \ --host-maintenance-interval=PERIODIC \ --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" \ --custom-tpu-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}" -# python3 xpk/xpk.py cluster create --cluster NAME \ +# python3 xpk.py cluster create --cluster NAME \ # --tpu-type=v5litepod-256 --num-slices=4 \ # --host-maintenance-interval=PERIODIC \ # --custom-cluster-arguments= --network=NETWORK --subnetwork=SUBNET --scopes=storage-full,gke-default --enable-ip-alias --enable-private-nodes --master-ipv4-cidr 172.16.0.32/28 --cluster-ipv4-cidr=10.224.0.0/12 --no-enable-master-authorized-networks @@ -162,7 +161,7 @@ 
echo python3 xpk/xpk.py cluster create \ ##### 2D ##################### # Rerun create command to update the cluster (with a new slice size) or if the create command fails. -python3 xpk/xpk.py cluster create \ +python3 xpk.py cluster create \ --cluster "${CLUSTER}" --tpu-type=v5litepod-256 \ --num-slices="${NUMSLICES}" \ --host-maintenance-interval=PERIODIC \ @@ -257,7 +256,7 @@ cd ../.. # Cluster create is the same command as run previously in step 2D. It will # not recreate the cluster but just update it. -python3 xpk/xpk.py cluster create \ +python3 xpk.py cluster create \ --cluster "${CLUSTER}" --tpu-type=v5litepod-256 \ --num-slices="${NUMSLICES}" \ --host-maintenance-interval=PERIODIC \ @@ -324,14 +323,14 @@ export TPU_NODEPOOL_ARGUMENTS=" \ # Confirm that variables are correctly set: ##### 5B ##################### -echo python3 xpk/xpk.py cluster create \ +echo python3 xpk.py cluster create \ --cluster "${CLUSTER}" --tpu-type=v5litepod-256 \ --num-slices="${NUMSLICES}" \ --host-maintenance-interval=PERIODIC \ --custom-cluster-arguments="${CLUSTER_ARGUMENTS}" \ --custom-tpu-nodepool-arguments="${TPU_NODEPOOL_ARGUMENTS}" -# python3 xpk/xpk.py cluster create --cluster NAME \ +# python3 xpk.py cluster create --cluster NAME \ # --tpu-type=v5litepod-256 --num-slices=64 \ # --host-maintenance-interval=PERIODIC \ # --custom-cluster-arguments= --network=NETWORK --subnetwork=SUBNET --scopes=storage-full,gke-default --enable-ip-alias --enable-private-nodes --master-ipv4-cidr 172.16.0.32/28 --cluster-ipv4-cidr=10.224.0.0/12 --no-enable-master-authorized-networks @@ -341,7 +340,7 @@ echo python3 xpk/xpk.py cluster create \ # Scale up to NUMSLICES (64 in the provided case) V5e-256s. 
##### 5C ##################### -python3 xpk/xpk.py cluster create \ +python3 xpk.py cluster create \ --cluster "${CLUSTER}" --tpu-type=v5litepod-256 \ --num-slices="${NUMSLICES}" \ --host-maintenance-interval=PERIODIC \ @@ -384,7 +383,7 @@ echo "https://console.cloud.google.com/kubernetes/clusters/details/us-central2/$ # Set --scheduler=gke.io/high-throughput-scheduler to use the high throughput scheduler. -python3 xpk/xpk.py workload create \ +python3 xpk.py workload create \ --scheduler=gke.io/high-throughput-scheduler \ --workload xpk-test-workload --command "echo hello world" --cluster ${CLUSTER} \ --tpu-type=v5litepod-256 --num-slices=${NUMSLICES} @@ -428,7 +427,7 @@ python3 xpk/xpk.py workload create \ # Use the link in the above "WORKLOAD_LOGS_LINK" view logs. You should see # the echo command in cloud logs. -python3 xpk/xpk.py workload list \ +python3 xpk.py workload list \ --cluster ${CLUSTER} ############################### @@ -482,7 +481,8 @@ bash docker_upload_runner.sh CLOUD_IMAGE_NAME="${USER}"_runner # Cluster cacheimage to enable faster start times. # XPK offers cacheimage as a wrapper around daemonset. ##### 7C ##################### -python3 xpk/xpk.py cluster cacheimage \ +cd ../xpk +python3 xpk.py cluster cacheimage \ --cluster ${CLUSTER} --docker-image gcr.io/"${PROJECT}"/"${USER}"_runner # [XPK] Starting xpk @@ -505,8 +505,8 @@ python3 xpk/xpk.py cluster cacheimage \ export NUMSLICES=64 -# Make sure you are in the maxtext github root directory when running this command -python3 xpk/xpk.py workload create \ +# Make sure you are in the xpk GitHub root directory when running this command +python3 xpk.py workload create \ --cluster ${CLUSTER} \ --docker-image gcr.io/${PROJECT}/"${USER}"_runner \ --workload "${USER}"-first-job \