diff --git a/PREFLIGHT.md b/PREFLIGHT.md index 64e0fb530e..d713f10f96 100644 --- a/PREFLIGHT.md +++ b/PREFLIGHT.md @@ -7,12 +7,12 @@ Before you run an ML workload on Multihost with GCE or GKE, simply apply `bash pref Here is an example for GCE: ``` -bash preflight.sh PLATFORM=GCE && python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=$YOUR_JOB_NAME +bash preflight.sh PLATFORM=GCE && python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?} ``` Here is an example for GKE: ``` -bash preflight.sh PLATFORM=GKE && python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=$YOUR_JOB_NAME +bash preflight.sh PLATFORM=GKE && python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?} ``` # Optimization 2: NUMA binding (You can only apply this to v4 and v5p) @@ -22,14 +22,14 @@ For GCE, [preflight.sh](https://github.com/google/maxtext/blob/main/preflight.sh) will help you install the `numactl` dependency, so you can use it directly. Here is an example: ``` -bash preflight.sh PLATFORM=GCE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=$YOUR_JOB_NAME +bash preflight.sh PLATFORM=GCE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?} ``` For GKE, `numactl` should be built into your docker image from [maxtext_tpu_dependencies.Dockerfile](https://github.com/google/maxtext/blob/main/dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile), so you can use it directly if you built the maxtext docker image. Here is an example: ``` -bash preflight.sh PLATFORM=GKE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=$YOUR_JOB_NAME +bash preflight.sh PLATFORM=GKE && numactl --membind 0 --cpunodebind=0 python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?} ``` 1. `numactl`: This is the command-line tool used for controlling NUMA policy for processes or shared memory. It's particularly useful on multi-socket systems where memory locality can impact performance.
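Throughout this diff, bare `$VAR` references are replaced with `${VAR?}`, a POSIX parameter expansion that makes the shell abort the command with an error when the variable is unset, instead of silently substituting an empty string. A minimal sketch of the behavior (`DEMO_VAR` is a hypothetical name used only for illustration; the exact error message varies by shell):

```bash
# ${VAR?} fails fast on unset variables; in a non-interactive script the shell exits here.
unset DEMO_VAR
echo "run_name=${DEMO_VAR?}"   # error: "DEMO_VAR: parameter null or not set"

DEMO_VAR=""                    # set, but empty
echo "run_name=${DEMO_VAR?}"   # succeeds: plain ? rejects only *unset* variables
echo "run_name=${DEMO_VAR:?}"  # error: the :? variant also rejects empty values
```

Note that an export left blank (e.g., `export VENV_NAME=`) still passes the plain `?` guard, so the placeholders in these guides still need real values.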
diff --git a/benchmarks/Getting_Started_Benchmarking.md b/benchmarks/Getting_Started_Benchmarking.md index 0db20116a7..fec38ec568 100644 --- a/benchmarks/Getting_Started_Benchmarking.md +++ b/benchmarks/Getting_Started_Benchmarking.md @@ -14,7 +14,7 @@ Two approaches are here: CLUSTER=my-cluster ZONE=my-zone PROJECT=my-project -python3 -m benchmarks.benchmark_runner xpk --project $PROJECT --zone $ZONE --cluster_name $CLUSTER --device_type v6e-256 --base_output_directory gs://maxtext-experiments-tpem/ --num_steps=5 +python3 -m benchmarks.benchmark_runner xpk --project ${PROJECT?} --zone ${ZONE?} --cluster_name ${CLUSTER?} --device_type v6e-256 --base_output_directory gs://maxtext-experiments-tpem/ --num_steps=5 ``` ```shell @@ -23,7 +23,7 @@ export RUNNER=us-docker.pkg.dev/path/to/maxtext_runner export PROXY_IMAGE=us-docker.pkg.dev/cloud-tpu-v2-images/pathways/proxy_server export SERVER_IMAGE=us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server -python3 -m benchmarks.benchmark_runner xpk --project $PROJECT --zone $ZONE --cluster_name $CLUSTER --device_type v6e-256 --base_output_directory gs://maxtext-experiments-tpem/ --num_steps=5 --pathways_server_image="${SERVER_IMAGE}" --pathways_proxy_server_image="${PROXY_IMAGE}" --pathways_runner_image="${RUNNER}" +python3 -m benchmarks.benchmark_runner xpk --project ${PROJECT?} --zone ${ZONE?} --cluster_name ${CLUSTER?} --device_type v6e-256 --base_output_directory gs://maxtext-experiments-tpem/ --num_steps=5 --pathways_server_image="${SERVER_IMAGE?}" --pathways_proxy_server_image="${PROXY_IMAGE?}" --pathways_runner_image="${RUNNER?}" ``` ```shell diff --git a/benchmarks/api_server/README.md b/benchmarks/api_server/README.md index ff51371e54..4de0624d45 100644 --- a/benchmarks/api_server/README.md +++ b/benchmarks/api_server/README.md @@ -131,17 +131,17 @@ export ICI_EXPERT_PARALLELISM=2 # 2. Define the Command to Run on the Cluster # ============================================================================== # This command installs dependencies and then starts the server. -CMD="export HF_TOKEN=${HF_TOKEN} && \ +CMD="export HF_TOKEN=${HF_TOKEN?} && \ pip install --upgrade pip && \ pip install -r benchmarks/api_server/requirements.txt && \ bash benchmarks/api_server/start_server.sh \ maxtext/configs/base.yml \ - model_name="${MODEL_NAME}" \ - tokenizer_path="${TOKENIZER_PATH}" \ - load_parameters_path="${LOAD_PARAMETERS_PATH}" \ - per_device_batch_size=${PER_DEVICE_BATCH_SIZE} \ - ici_tensor_parallelism=${ICI_TENSOR_PARALLELISM} \ - ici_expert_parallelism=${ICI_EXPERT_PARALLELISM} \ + model_name="${MODEL_NAME?}" \ + tokenizer_path="${TOKENIZER_PATH?}" \ + load_parameters_path="${LOAD_PARAMETERS_PATH?}" \ + per_device_batch_size=${PER_DEVICE_BATCH_SIZE?} \ + ici_tensor_parallelism=${ICI_TENSOR_PARALLELISM?} \ + ici_expert_parallelism=${ICI_EXPERT_PARALLELISM?} \ tokenizer_type=\"huggingface\" \ return_log_prob=True" @@ -149,16 +149,16 @@ CMD="export HF_TOKEN=${HF_TOKEN} && \ # ============================================================================== # 3. Launch the Workload # ============================================================================== -echo "Launching workload ${RUNNAME}..." -xpk workload create --workload "${RUNNAME}" \ - --base-docker-image "${DOCKER_IMAGE}" \ - --command "${CMD}" \ +echo "Launching workload ${RUNNAME?}..." 
+xpk workload create --workload "${RUNNAME?}" \ + --base-docker-image "${DOCKER_IMAGE?}" \ + --command "${CMD?}" \ --num-slices=1 \ - --cluster "${CLUSTER}" --device-type "${DEVICE_TYPE}" --project "${PROJECT}" --zone "${ZONE}" + --cluster "${CLUSTER?}" --device-type "${DEVICE_TYPE?}" --project "${PROJECT?}" --zone "${ZONE?}" -echo "Workload ${RUNNAME} created." +echo "Workload ${RUNNAME?} created." echo "Use the following command to connect:" -echo "bash benchmarks/api_server/port_forward_xpk.sh job_name=${RUNNAME} project=${PROJECT} zone=${ZONE} cluster=${CLUSTER}" +echo "bash benchmarks/api_server/port_forward_xpk.sh job_name=${RUNNAME?} project=${PROJECT?} zone=${ZONE?} cluster=${CLUSTER?}" ``` ### 2. Launch the Workload diff --git a/benchmarks/maxtest/getting_started.md b/benchmarks/maxtest/getting_started.md index 3037a1b022..e4d4993045 100644 --- a/benchmarks/maxtest/getting_started.md +++ b/benchmarks/maxtest/getting_started.md @@ -55,7 +55,7 @@ If we want to pass custom flags this is also possible by specifying Useful for checking for the existence of SDC (silent data corruption) on TPU hardware. ``` -bash maxtest.sh --project $TPU_PROJECT --cluster $CLUSTER --region $REGION --nodepool $NODEPOOL_NAME --num_workers $NUM_WORKERS --libtpu_args '--xla_tpu_enable_sdc_checker' +bash maxtest.sh --project ${TPU_PROJECT?} --cluster ${CLUSTER?} --region ${REGION?} --nodepool ${NODEPOOL_NAME?} --num_workers ${NUM_WORKERS?} --libtpu_args '--xla_tpu_enable_sdc_checker' ``` diff --git a/docs/guides/checkpointing_solutions/convert_checkpoint.md b/docs/guides/checkpointing_solutions/convert_checkpoint.md index c896a22407..325d9d95ef 100644 --- a/docs/guides/checkpointing_solutions/convert_checkpoint.md +++ b/docs/guides/checkpointing_solutions/convert_checkpoint.md @@ -37,8 +37,8 @@ First, make sure the python3 virtual environment for MaxText is set up and activated. ```bash export VENV_NAME= # e.g., maxtext_venv pip install uv -uv venv --python 3.12 --seed $VENV_NAME -source $VENV_NAME/bin/activate +uv venv --python 3.12 --seed ${VENV_NAME?} +source ${VENV_NAME?}/bin/activate ``` Second, ensure you have the necessary dependencies installed (PyTorch for the conversion script). @@ -68,16 +68,16 @@ Finally, run the command below to complete the conversion ```bash python3 -m maxtext.checkpoint_conversion.to_maxtext maxtext/configs/base.yml \ - model_name=${HF_MODEL} \ - hf_access_token=${HF_TOKEN} \ - base_output_directory=${MODEL_CHECKPOINT_DIRECTORY} \ + model_name=${HF_MODEL?} \ + hf_access_token=${HF_TOKEN?} \ + base_output_directory=${MODEL_CHECKPOINT_DIRECTORY?} \ scan_layers=True \ use_multimodal=false \ hardware=cpu \ skip_jax_distributed_system=true \ - checkpoint_storage_use_zarr3=${USE_ZARR3} \ - checkpoint_storage_use_ocdbt=${USE_OCDBT} \ - --lazy_load_tensors=${LAZY_LOAD_TENSORS} + checkpoint_storage_use_zarr3=${USE_ZARR3?} \ + checkpoint_storage_use_ocdbt=${USE_OCDBT?} \ + --lazy_load_tensors=${LAZY_LOAD_TENSORS?} ``` **Key arguments:** diff --git a/docs/guides/checkpointing_solutions/emergency_checkpointing.md b/docs/guides/checkpointing_solutions/emergency_checkpointing.md index 0361d560b6..356b3b2f51 100644 --- a/docs/guides/checkpointing_solutions/emergency_checkpointing.md +++ b/docs/guides/checkpointing_solutions/emergency_checkpointing.md @@ -75,8 +75,8 @@ In this scenario, you should configure each pod in that slice with a ramdisk of ``` 2.
**Configure gcloud:** ```bash - gcloud config set project ${PROJECT_ID} - gcloud config set compute/zone ${ZONE} + gcloud config set project ${PROJECT_ID?} + gcloud config set compute/zone ${ZONE?} ``` 3. **Clone the XPK repository:** ```bash @@ -85,15 +85,15 @@ In this scenario, you should configure each pod in that slice with a ramdisk of 4. **Run the cluster creation command:** ```bash python3 xpk/xpk.py cluster create \ - --cluster ${CLUSTER_NAME} \ - --cluster-cpu-machine-type=${MACHINE_TYPE} \ - --num-slices=${NUM_SLICES} \ - --tpu-type=${TPU_TYPE} \ + --cluster ${CLUSTER_NAME?} \ + --cluster-cpu-machine-type=${MACHINE_TYPE?} \ + --num-slices=${NUM_SLICES?} \ + --tpu-type=${TPU_TYPE?} \ --enable-mtc \ --enable-gcsfuse-csi-driver \ - --mtc-ramdisk-size=${RAMDISK_SIZE} \ - --mtc-gcs-bucket=${OUTPUT_PATH} \ - --gke-version=${GKE_VERSION} + --mtc-ramdisk-size=${RAMDISK_SIZE?} \ + --mtc-gcs-bucket=${OUTPUT_PATH?} \ + --gke-version=${GKE_VERSION?} ``` ## MaxText configuration @@ -150,12 +150,12 @@ The flags below would give the user access to the ramdisk in their workload: ```bash python3 xpk/xpk.py workload create \ - --cluster ${CLUSTER_NAME} \ - --docker-image ${DOCKER_IMAGE} \ - --workload ${WORKLOAD_NAME} \ - --tpu-type=${TPU_TYPE} \ - --num-slices=${NUM_SLICES} \ - --ramdisk-directory=${RAMDISK_DIRECTORY} \ + --cluster ${CLUSTER_NAME?} \ + --docker-image ${DOCKER_IMAGE?} \ + --workload ${WORKLOAD_NAME?} \ + --tpu-type=${TPU_TYPE?} \ + --num-slices=${NUM_SLICES?} \ + --ramdisk-directory=${RAMDISK_DIRECTORY?} \ --mtc-enabled \ - --command "python3 src/maxtext/trainers/pre_train/train.py src/maxtext/configs/base.yml base_output_directory=$OUTPUT_PATH dataset_path=$DATA_PATH steps=120 per_device_batch_size=6 enable_checkpoint_cloud_logger=True checkpoint_period=${CHECKPOINT_PEROID} enable_emergency_checkpoint=True local_checkpoint_period=${LOCAL_CHECKPOINT_PERIOD} local_checkpoint_directory=/${RAMDISK_DIRECTORY}" + --command "python3 src/maxtext/trainers/pre_train/train.py src/maxtext/configs/base.yml base_output_directory=${OUTPUT_PATH?} dataset_path=${DATA_PATH?} steps=120 per_device_batch_size=6 enable_checkpoint_cloud_logger=True checkpoint_period=${CHECKPOINT_PEROID?} enable_emergency_checkpoint=True local_checkpoint_period=${LOCAL_CHECKPOINT_PERIOD?} local_checkpoint_directory=/${RAMDISK_DIRECTORY?}" ``` diff --git a/docs/guides/checkpointing_solutions/multi_tier_checkpointing.md b/docs/guides/checkpointing_solutions/multi_tier_checkpointing.md index 81d9774de8..b93f1b9a1e 100644 --- a/docs/guides/checkpointing_solutions/multi_tier_checkpointing.md +++ b/docs/guides/checkpointing_solutions/multi_tier_checkpointing.md @@ -105,8 +105,8 @@ In this scenario, you should configure each pod in that slice with a ramdisk of ``` 2. **Configure gcloud:** ```bash - gcloud config set project ${PROJECT_ID} - gcloud config set compute/zone ${ZONE} + gcloud config set project ${PROJECT_ID?} + gcloud config set compute/zone ${ZONE?} ``` 3. **Clone the XPK repository:** ```bash @@ -115,15 +115,15 @@ In this scenario, you should configure each pod in that slice with a ramdisk of 4. 
**Run the cluster creation command:** ```bash python3 xpk/xpk.py cluster create \ - --cluster ${CLUSTER_NAME} \ - --cluster-cpu-machine-type=${MACHINE_TYPE} \ - --num-slices=${NUM_SLICES} \ - --tpu-type=${TPU_TYPE} \ + --cluster ${CLUSTER_NAME?} \ + --cluster-cpu-machine-type=${MACHINE_TYPE?} \ + --num-slices=${NUM_SLICES?} \ + --tpu-type=${TPU_TYPE?} \ --enable-mtc \ --enable-gcsfuse-csi-driver \ - --mtc-ramdisk-size=${RAMDISK_SIZE} \ - --mtc-gcs-bucket=${OUTPUT_PATH} \ - --gke-version=${GKE_VERSION} + --mtc-ramdisk-size=${RAMDISK_SIZE?} \ + --mtc-gcs-bucket=${OUTPUT_PATH?} \ + --gke-version=${GKE_VERSION?} ``` ## MaxText configuration @@ -179,12 +179,12 @@ The flags below would give the user access to the ramdisk in their workload: ```bash python3 xpk/xpk.py workload create \ - --cluster ${CLUSTER_NAME} \ - --docker-image ${DOCKER_IMAGE} \ - --workload ${WORKLOAD_NAME} \ - --tpu-type=${TPU_TYPE} \ - --num-slices=${NUM_SLICES} \ - --ramdisk-directory=${RAMDISK_DIRECTORY} \ + --cluster ${CLUSTER_NAME?} \ + --docker-image ${DOCKER_IMAGE?} \ + --workload ${WORKLOAD_NAME?} \ + --tpu-type=${TPU_TYPE?} \ + --num-slices=${NUM_SLICES?} \ + --ramdisk-directory=${RAMDISK_DIRECTORY?} \ --mtc-enabled \ - --command "python3 src/maxtext/trainers/pre_train/train.py src/maxtext/configs/base.yml base_output_directory=$OUTPUT_PATH dataset_path=$DATA_PATH steps=120 per_device_batch_size=6 enable_checkpoint_cloud_logger=True checkpoint_period=${CHECKPOINT_PEROID} enable_multi_tier_checkpointing=True local_checkpoint_period=${LOCAL_CHECKPOINT_PERIOD} local_checkpoint_directory=/${RAMDISK_DIRECTORY} multi_tier_checkpointing_backup_interval_minutes=${MULTI_TIER_CHECKPOINTING_BACKUP_INT_MIN}" + --command "python3 src/maxtext/trainers/pre_train/train.py src/maxtext/configs/base.yml base_output_directory=${OUTPUT_PATH?} dataset_path=${DATA_PATH?} steps=120 per_device_batch_size=6 enable_checkpoint_cloud_logger=True checkpoint_period=${CHECKPOINT_PEROID?} enable_multi_tier_checkpointing=True local_checkpoint_period=${LOCAL_CHECKPOINT_PERIOD?} local_checkpoint_directory=/${RAMDISK_DIRECTORY?} multi_tier_checkpointing_backup_interval_minutes=${MULTI_TIER_CHECKPOINTING_BACKUP_INT_MIN?}" ``` diff --git a/docs/guides/data_input_pipeline/data_input_grain.md b/docs/guides/data_input_pipeline/data_input_grain.md index 625f8a4648..63c60482e8 100644 --- a/docs/guides/data_input_pipeline/data_input_grain.md +++ b/docs/guides/data_input_pipeline/data_input_grain.md @@ -38,9 +38,9 @@ Grain ensures determinism in data input pipelines by saving the pipeline's state ```sh bash tools/setup/setup_gcsfuse.sh \ -DATASET_GCS_BUCKET=$BUCKET_NAME \ -MOUNT_PATH=$MOUNT_PATH \ -[FILE_PATH=$MOUNT_PATH/my_dataset] +DATASET_GCS_BUCKET=${BUCKET_NAME?} \ +MOUNT_PATH=${MOUNT_PATH?} \ +[FILE_PATH=${MOUNT_PATH?}/my_dataset] ``` Note that `FILE_PATH` is optional; when provided, the script runs `ls -R` for pre-filling the metadata cache (see ["Performance tuning best practices" on the Google Cloud documentation](https://cloud.google.com/storage/docs/cloud-storage-fuse/performance#improve-first-time-reads)). 
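For concreteness, here is the same `setup_gcsfuse.sh` invocation with the optional `FILE_PATH` supplied; the bucket and mount values are hypothetical placeholders:

```sh
bash tools/setup/setup_gcsfuse.sh \
DATASET_GCS_BUCKET=my-dataset-bucket \
MOUNT_PATH=/tmp/gcsfuse \
FILE_PATH=/tmp/gcsfuse/my_dataset
```

With `FILE_PATH` set, the script runs `ls -R` over that path to pre-fill the gcsfuse metadata cache.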
diff --git a/docs/guides/monitoring_and_debugging/monitor_goodput.md b/docs/guides/monitoring_and_debugging/monitor_goodput.md index 73234648a8..62bbb7e04a 100644 --- a/docs/guides/monitoring_and_debugging/monitor_goodput.md +++ b/docs/guides/monitoring_and_debugging/monitor_goodput.md @@ -89,8 +89,8 @@ Please use a unique workload name, unless you intend to monitor cumulative Goodp MaxText enables Goodput recording and monitoring by default with `enable_goodput_recording=True` and `monitor_goodput=True`. You can configure the goodput upload frequency by setting `goodput_upload_interval_seconds`. ```bash -python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml base_output_directory=$OUTPUT_PATH \ - dataset_path=$DATA_PATH run_name=goodput-test-run steps=200 goodput_upload_interval_seconds=30 +python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml base_output_directory=${OUTPUT_PATH?} \ + dataset_path=${DATA_PATH?} run_name=goodput-test-run steps=200 goodput_upload_interval_seconds=30 ``` #### How to monitor step time deviation @@ -98,8 +98,8 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml base_ou MaxText enables step time deviation monitoring by default with `monitor_step_time_deviation=True`. You can configure the upload frequency by setting `step_deviation_interval_seconds`. ```bash -python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml base_output_directory=$OUTPUT_PATH \ - dataset_path=$DATA_PATH run_name=goodput-test-run steps=200 step_deviation_interval_seconds=30 +python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml base_output_directory=${OUTPUT_PATH?} \ + dataset_path=${DATA_PATH?} run_name=goodput-test-run steps=200 step_deviation_interval_seconds=30 ``` #### How to enable Pathways Goodput @@ -111,7 +111,7 @@ Enabling `enable_pathways_goodput` turns on Goodput measurement for Pathways wor ``` ```bash -python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml base_output_directory=$OUTPUT_PATH dataset_path=$DATA_PATH \ +python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml base_output_directory=${OUTPUT_PATH?} dataset_path=${DATA_PATH?} \ run_name=goodput-test-run steps=200 goodput_upload_interval_seconds=30 enable_pathways_goodput=True ``` @@ -168,7 +168,7 @@ and `enable_gcp_step_deviation_metrics` to `False` for disabling step deviation metrics. 
```bash -python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml base_output_directory=$OUTPUT_PATH dataset_path=$DATA_PATH \ +python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml base_output_directory=${OUTPUT_PATH?} dataset_path=${DATA_PATH?} \ run_name=goodput-test-run steps=200 goodput_upload_interval_seconds=30 enable_gcp_goodput_metrics=False \ enable_gcp_step_deviation_metrics=False ``` diff --git a/docs/reference/core_concepts/quantization.md b/docs/reference/core_concepts/quantization.md index 1a81b675c5..5312d696a8 100644 --- a/docs/reference/core_concepts/quantization.md +++ b/docs/reference/core_concepts/quantization.md @@ -87,7 +87,7 @@ Common options for the `quantization` flag when using Qwix include: Here is an example of how to run a training job with int8 quantization enabled via Qwix: ```bash -python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=$YOUR_JOB_NAME base_output_directory=gs:// dataset_type=synthetic use_qwix_quantization=true quantization='int8' +python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?} base_output_directory=gs:// dataset_type=synthetic use_qwix_quantization=true quantization='int8' ``` #### The Qwix Interception API @@ -142,7 +142,7 @@ When using AQT, you can pass one of the following values to the `quantization` f #### Example command for AQT ```bash -python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=$YOUR_JOB_NAME base_output_directory=gs:// dataset_type=synthetic use_qwix_quantization=false quantization='int8' +python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${YOUR_JOB_NAME?} base_output_directory=gs:// dataset_type=synthetic use_qwix_quantization=false quantization='int8' ``` Note that `use_qwix_quantization` is not set to `True`. 
diff --git a/docs/run_maxtext/run_maxtext_localhost.md b/docs/run_maxtext/run_maxtext_localhost.md index d2adc5669b..5f7b428f16 100644 --- a/docs/run_maxtext/run_maxtext_localhost.md +++ b/docs/run_maxtext/run_maxtext_localhost.md @@ -59,7 +59,7 @@ After the installation is complete, run a short training job using synthetic dat ```bash python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ dataset_type=synthetic \ steps=10 @@ -73,7 +73,7 @@ To demonstrate model output, run the following command: ```bash python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ per_device_batch_size=1 ``` @@ -94,7 +94,7 @@ To use a pre-configured model for TPUs, you override the `model_name` parameter, ```bash python3 -m maxtext.trainers.pre_train.train maxtext/configs/base.yml \ model_name=llama3-8b \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ dataset_type=synthetic \ steps=10 @@ -108,7 +108,7 @@ python3 -m maxtext.trainers.pre_train.train maxtext/configs/base.yml \ ```bash python3 -m maxtext.trainers.pre_train.train maxtext/configs/base.yml \ model_name=qwen3-4b \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ dataset_type=synthetic \ steps=10 @@ -125,7 +125,7 @@ To use a GPU-optimized configuration, you should specify the path to the model's ```bash python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/gpu/models/mixtral_8x7b.yml \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ dataset_type=synthetic \ steps=10 @@ -140,7 +140,7 @@ This will load `gpu/mixtral_8x7b.yml`, which inherits from `base.yml`. ```bash python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/gpu/models/llama3-8b.yml \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ dataset_type=synthetic \ steps=10 diff --git a/docs/run_maxtext/run_maxtext_single_host_gpu.md b/docs/run_maxtext/run_maxtext_single_host_gpu.md index 69f44f0025..51d0c36b1b 100644 --- a/docs/run_maxtext/run_maxtext_single_host_gpu.md +++ b/docs/run_maxtext/run_maxtext_single_host_gpu.md @@ -83,8 +83,8 @@ cd maxtext ```bash export LOCAL_IMAGE_NAME= sudo bash docker_build_dependency_image.sh DEVICE=gpu -docker tag maxtext_base_image $LOCAL_IMAGE_NAME -docker push $LOCAL_IMAGE_NAME +docker tag maxtext_base_image ${LOCAL_IMAGE_NAME?} +docker push ${LOCAL_IMAGE_NAME?} ``` Note that when running `bash docker_build_dependency_image.sh DEVICE=gpu`, it @@ -137,11 +137,11 @@ export NNODES=1 Update script and run the command with synthetic data: ``` -base_output_directory: A GCS Bucket +base_output_directory: A GCS Bucket dataset_type: Synthetic or pass a real bucket attention:cudnn_flash_te (The default in maxtext is flash. 
Flash does not work on GPUs) -scan_layers=False -use_iota_embed=True +scan_layers=False +use_iota_embed=True hardware=gpu per_device_batch_size=12 [Update this to get a better MFU] Hardware: GPU @@ -165,9 +165,9 @@ https://github.com/AI-Hypercomputer/maxtext/tree/main/src/maxtext/configs/gpu/a3 echo "Running 1vm.sh" # Example command to invoke this script via XPK -# python3 xpk/xpk.py workload create --cluster ${CLUSTER_NAME} \ -# --workload ${WORKLOAD_NAME} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME} \ -# --device-type ${DEVICE_TYPE} --num-slices 1 \ +# python3 xpk/xpk.py workload create --cluster ${CLUSTER_NAME?} \ +# --workload ${WORKLOAD_NAME?} --docker-image=gcr.io/supercomputer-testing/${LOCAL_IMAGE_NAME?} \ +# --device-type ${DEVICE_TYPE?} --num-slices 1 \ # --command "bash src/maxtext/configs/gpu/a3/llama_2_7b/1vm.sh" # Stop execution if any command exits with error @@ -182,7 +182,7 @@ for ARGUMENT in "$@"; do export "$KEY"="$VALUE" done -export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/ +export XLA_FLAGS="--xla_dump_to=${OUTPUT_PATH?}/${RUN_NAME?}/HLO_dumps/ --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_command_buffer='' --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728 @@ -194,7 +194,7 @@ export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/ # 1 node, DATA_DP=1, ICI_FSDP=8 -python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/gpu/models/llama2_7b.yml run_name=$RUN_NAME dcn_data_parallelism=1 \ - ici_fsdp_parallelism=8 base_output_directory=$OUTPUT_PATH attention=cudnn_flash_te scan_layers=False \ +python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/gpu/models/llama2_7b.yml run_name=${RUN_NAME?} dcn_data_parallelism=1 \ + ici_fsdp_parallelism=8 base_output_directory=${OUTPUT_PATH?} attention=cudnn_flash_te scan_layers=False \ use_iota_embed=True hardware=gpu ``` diff --git a/docs/run_maxtext/run_maxtext_via_multihost_job.md b/docs/run_maxtext/run_maxtext_via_multihost_job.md index 8074e27ac2..a364a25365 100644 --- a/docs/run_maxtext/run_maxtext_via_multihost_job.md +++ b/docs/run_maxtext/run_maxtext_via_multihost_job.md @@ -43,8 +43,8 @@ The `multihost_job.py` script: ``` ``` - gcloud config set project $PROJECT - gcloud config set compute/zone $ZONE + gcloud config set project ${PROJECT?} + gcloud config set compute/zone ${ZONE?} ``` 3. **Link to a GCS bucket.** @@ -67,8 +67,8 @@ The `multihost_job.py` script: ``` ```sh - RUN_NAME=$YOUR_JOB_NAME # You may set this to any unique name for a fresh run. - python3 multihost_job.py --NUM_SLICES=$NODE_COUNT --RUN_NAME=$RUN_NAME --BUCKET_NAME=$BUCKET_NAME --CQR_EXTRA_ARGS="--reserved" --COMMAND="bash tools/setup/setup.sh && python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=$RUN_NAME" + RUN_NAME=${YOUR_JOB_NAME?} # You may set this to any unique name for a fresh run. 
+ python3 multihost_job.py --NUM_SLICES=${NODE_COUNT?} --RUN_NAME=${RUN_NAME?} --BUCKET_NAME=${BUCKET_NAME?} --CQR_EXTRA_ARGS="--reserved" --COMMAND="bash tools/setup/setup.sh && python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${RUN_NAME?}" ``` We tell `multihost_job` to target the `reserved` pool by including `--reserved` as extra arguments to the CQR request, but you may instead target the `on-demand` pool by removing the `--CQR_EXTRA_ARGS` flag (on-demand is default), or the pre-emptible pool with `--CQR_EXTRA_ARGS="--best-effort"`, which may be necessary if your reservation is full. diff --git a/docs/run_maxtext/run_maxtext_via_multihost_runner.md b/docs/run_maxtext/run_maxtext_via_multihost_runner.md index c5f284a8d8..13688a8065 100644 --- a/docs/run_maxtext/run_maxtext_via_multihost_runner.md +++ b/docs/run_maxtext/run_maxtext_via_multihost_runner.md @@ -47,8 +47,8 @@ Although there are several steps below, most are for the initial setup. Once set ``` ``` - gcloud config set project $PROJECT - gcloud config set compute/zone $ZONE + gcloud config set project ${PROJECT?} + gcloud config set compute/zone ${ZONE?} ``` Create SSH keys for gcloud; we recommend leaving the password blank (hit enter twice after running the command below). If you are prompted that the file already exists, you can choose not to overwrite it by selecting "n". @@ -74,7 +74,7 @@ Although there are several steps below, most are for the initial setup. Once set Create a multislice environment of nodes using a Create Queued Resources request ``` - gcloud alpha compute tpus queued-resources create $QR_ID --accelerator-type=v4-8 --runtime-version=tpu-ubuntu2204-base --node-count=$NODE_COUNT --node-prefix=$TPU_PREFIX --reserved + gcloud alpha compute tpus queued-resources create ${QR_ID?} --accelerator-type=v4-8 --runtime-version=tpu-ubuntu2204-base --node-count=${NODE_COUNT?} --node-prefix=${TPU_PREFIX?} --reserved ``` We target the `reserved` pool above, but you may instead target the `on-demand` pool by omitting this flag, @@ -83,14 +83,14 @@ Although there are several steps below, most are for the initial setup. Once set You have to wait for the QR to become `ACTIVE` (as opposed to `ACCEPTED` or `PROVISIONING`) which corresponds to the worker nodes becoming `READY` (as opposed to `CREATING`). This may take a minute or two and can be checked via ``` - gcloud alpha compute tpus queued-resources list --filter=$QR_ID + gcloud alpha compute tpus queued-resources list --filter=${QR_ID?} ``` 4. **Install dependencies.** Install the dependencies of `train.py` on each worker using `multihost_runner.py`: ``` - python3 multihost_runner.py --TPU_PREFIX=$TPU_PREFIX --COMMAND="bash tools/setup/setup.sh" + python3 multihost_runner.py --TPU_PREFIX=${TPU_PREFIX?} --COMMAND="bash tools/setup/setup.sh" ``` If you are running the `multihost_runner.py` script from a TPUVM, you will need to set `--INTERNAL_IP=true`. @@ -106,7 +106,7 @@ Although there are several steps below, most are for the initial setup. Once set Set config values for `base_output_directory` and `dataset_path` in `configs/base.yml` if not set already.
``` - python3 multihost_runner.py --TPU_PREFIX=$TPU_PREFIX --COMMAND="python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=$RUN_NAME" + python3 multihost_runner.py --TPU_PREFIX=${TPU_PREFIX?} --COMMAND="python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${RUN_NAME?}" ``` If you are running the `multihost_runner.py` script from a TPUVM, you will need to set `--INTERNAL_IP=true`. @@ -114,7 +114,7 @@ Although there are several steps below, most are for the initial setup. Once set 6. **Clean up TPUs and QR when finished.** ``` - gcloud alpha compute tpus queued-resources delete $QR_ID --force --async + gcloud alpha compute tpus queued-resources delete ${QR_ID?} --force --async ``` The `--force` flag deletes both the queued resources and the TPU VMs; without it, only a `SUSPENDED` queued resource whose TPUs have already been deleted can itself be deleted. We highly recommend the `--async` flag since deleting the TPUs and QR will take a minute or two. diff --git a/docs/run_maxtext/run_maxtext_via_pathways.md b/docs/run_maxtext/run_maxtext_via_pathways.md index f022c068c9..6da385ad98 100644 --- a/docs/run_maxtext/run_maxtext_via_pathways.md +++ b/docs/run_maxtext/run_maxtext_via_pathways.md @@ -74,7 +74,7 @@ export WORKLOAD_NODEPOOL_COUNT=1 # Number of TPU slices for your job export BUCKET_NAME="your-gcs-bucket-name" export RUN_NAME="maxtext-run-1" # The Docker image you pushed in the prerequisite step -export DOCKER_IMAGE="gcr.io/${PROJECT}/${USER}_runner" +export DOCKER_IMAGE="gcr.io/${PROJECT?}/${USER}_runner" ``` ## 3. Running a batch workload @@ -87,20 +87,20 @@ Use the `xpk workload create-pathways` command to start the job. ```bash xpk workload create-pathways \ - --workload=$WORKLOAD_NAME \ - --cluster=$CLUSTER \ - --num-slices=$WORKLOAD_NODEPOOL_COUNT \ - --tpu-type=$TPU_TYPE \ - --project=$PROJECT \ - --zone=$ZONE \ - --docker-image=${DOCKER_IMAGE} \ + --workload=${WORKLOAD_NAME?} \ + --cluster=${CLUSTER?} \ + --num-slices=${WORKLOAD_NODEPOOL_COUNT?} \ + --tpu-type=${TPU_TYPE?} \ + --project=${PROJECT?} \ + --zone=${ZONE?} \ + --docker-image=${DOCKER_IMAGE?} \ --command="python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \ - base_output_directory=gs://${BUCKET_NAME} \ + base_output_directory=gs://${BUCKET_NAME?} \ per_device_batch_size=1 \ enable_checkpointing=false \ dataset_type=synthetic \ enable_single_controller=True \ - run_name=${RUN_NAME}-pathways-batch" + run_name=${RUN_NAME?}-pathways-batch" ``` ### Verify the workload @@ -108,7 +108,7 @@ You can check the status of your running workloads with the `xpk workload list` command. ```bash -xpk workload list --cluster=$CLUSTER --project=$PROJECT --zone=$ZONE +xpk workload list --cluster=${CLUSTER?} --project=${PROJECT?} --zone=${ZONE?} ``` ## 4.
Running a headless (interactive) workload @@ -122,12 +122,12 @@ This command reserves the TPUs and starts the Pathways head service on the clust ```bash xpk workload create-pathways \ --headless \ - --workload=${WORKLOAD_NAME} \ - --num-slices=${WORKLOAD_NODEPOOL_COUNT} \ - --tpu-type=${TPU_TYPE} \ - --project=${PROJECT} \ - --zone=${ZONE} \ - --cluster=${CLUSTER} + --workload=${WORKLOAD_NAME?} \ + --num-slices=${WORKLOAD_NODEPOOL_COUNT?} \ + --tpu-type=${TPU_TYPE?} \ + --project=${PROJECT?} \ + --zone=${ZONE?} \ + --cluster=${CLUSTER?} ``` ### Step 2: Connect to the cluster via port forwarding @@ -138,7 +138,7 @@ This command forwards local port 29000 to the controller pod in the cluster. It ```bash kubectl port-forward \ - "$(kubectl get pods -o name | grep ${WORKLOAD_NAME}-pathways-head)" \ + "$(kubectl get pods -o name | grep ${WORKLOAD_NAME?}-pathways-head)" \ 29000:29000 &> /dev/null & ``` @@ -153,12 +153,12 @@ export JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 # Run the training script python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \ - base_output_directory=gs://${BUCKET_NAME} \ + base_output_directory=gs://${BUCKET_NAME?} \ per_device_batch_size=1 \ enable_checkpointing=false \ dataset_type=synthetic \ enable_single_controller=True \ - run_name=${RUN_NAME}-pathways-headless + run_name=${RUN_NAME?}-pathways-headless ``` The output streams directly to your terminal, just as if you were running on a local accelerator. @@ -171,7 +171,7 @@ The output streams directly to your terminal, just as if you were running on a l - Ensure you have successfully pushed the image to your project's Artifact Registry. - Check that your GKE cluster has permissions to pull from the registry. - **`kubectl port-forward` fails**: - - Confirm that the pod from Step 1 is running (`kubectl get pods`). The name should match `${WORKLOAD_NAME}-pathways-head-0`. + - Confirm that the pod from Step 1 is running (`kubectl get pods`). The name should match `${WORKLOAD_NAME?}-pathways-head-0`. - Ensure you are authenticated with `kubectl` and have the correct context set for your GKE cluster. - Make sure you import the `pathwaysutils` package and call `pathwaysutils.initialize()` in your script when running the workload. diff --git a/docs/run_maxtext/run_maxtext_via_xpk.md b/docs/run_maxtext/run_maxtext_via_xpk.md index 2678ac2bf8..9ad05000a9 100644 --- a/docs/run_maxtext/run_maxtext_via_xpk.md +++ b/docs/run_maxtext/run_maxtext_via_xpk.md @@ -162,8 +162,8 @@ This guide focuses on submitting workloads to an existing cluster. Cluster creat 2.
**Configure gcloud CLI** ``` - gcloud config set project $PROJECT_ID - gcloud config set compute/zone $ZONE + gcloud config set project ${PROJECT_ID?} + gcloud config set compute/zone ${ZONE?} ``` ### A Note on multi-slice and multi-node runs @@ -178,24 +178,24 @@ For instance, to run a job across **four TPU slices**, you would change `--num-s ``` xpk workload create\ - --cluster ${CLUSTER_NAME}\ + --cluster ${CLUSTER_NAME?}\ --workload ${USER}-tpu-job\ --base-docker-image maxtext_base_image\ --tpu-type v5litepod-256\ --num-slices 1\ - --command "python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${USER}-tpu-job base_output_directory=${BASE_OUTPUT_DIR} dataset_path=${DATASET_PATH} steps=100" + --command "python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${USER}-tpu-job base_output_directory=${BASE_OUTPUT_DIR?} dataset_path=${DATASET_PATH?} steps=100" ``` - **On your GPU cluster:** ``` xpk workload create\ - --cluster ${CLUSTER_NAME}\ + --cluster ${CLUSTER_NAME?}\ --workload ${USER}-gpu-job\ --base-docker-image maxtext_base_image\ --device-type h100-80gb-8\ --num-nodes 2\ - --command "python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${USER}-gpu-job base_output_directory=${BASE_OUTPUT_DIR} dataset_path=${DATASET_PATH} steps=100" + --command "python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml run_name=${USER}-gpu-job base_output_directory=${BASE_OUTPUT_DIR?} dataset_path=${DATASET_PATH?} steps=100" ``` ______________________________________________________________________ @@ -215,7 +215,7 @@ ______________________________________________________________________ - **List your jobs:** ``` - xpk workload list --cluster ${CLUSTER_NAME} + xpk workload list --cluster ${CLUSTER_NAME?} ``` - **Analyze output:** Checkpoints and other artifacts will be saved to the Google Cloud Storage bucket you specified in `BASE_OUTPUT_DIR`. 
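To browse those artifacts from the command line, a minimal sketch using `gcloud storage`; the `${USER}-tpu-job/checkpoints/` layout is an assumption based on the `run_name` used above and MaxText's default output structure:

```
gcloud storage ls "${BASE_OUTPUT_DIR?}"
gcloud storage ls "${BASE_OUTPUT_DIR?}/${USER}-tpu-job/checkpoints/"
```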
@@ -223,5 +223,5 @@ ______________________________________________________________________ - **Delete a job:** ``` - xpk workload delete --cluster ${CLUSTER_NAME} --workload + xpk workload delete --cluster ${CLUSTER_NAME?} --workload ``` diff --git a/docs/tutorials/first_run.md b/docs/tutorials/first_run.md index 8e0da2693d..7fee3673e1 100644 --- a/docs/tutorials/first_run.md +++ b/docs/tutorials/first_run.md @@ -50,7 +50,7 @@ pre-commit install ```sh python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ dataset_type=synthetic \ steps=10 @@ -62,7 +62,7 @@ Optional: If you want to try training on a Hugging Face dataset, see [Data Input ```sh python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ per_device_batch_size=1 ``` @@ -84,7 +84,7 @@ You can use [demo_decoding.ipynb](https://github.com/AI-Hypercomputer/maxtext/bl ```sh python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ dataset_type=synthetic \ steps=10 @@ -94,7 +94,7 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \ ```sh python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \ - run_name=$YOUR_JOB_NAME \ + run_name=${YOUR_JOB_NAME?} \ base_output_directory=gs:// \ per_device_batch_size=1 ``` diff --git a/docs/tutorials/posttraining/full_finetuning.md b/docs/tutorials/posttraining/full_finetuning.md index 7af1586ebc..9455505402 100644 --- a/docs/tutorials/posttraining/full_finetuning.md +++ b/docs/tutorials/posttraining/full_finetuning.md @@ -32,8 +32,8 @@ cd maxtext # 2. Create virtual environment export VENV_NAME= # e.g., maxtext_venv pip install uv -uv venv --python 3.12 --seed $VENV_NAME -source $VENV_NAME/bin/activate +uv venv --python 3.12 --seed ${VENV_NAME?} +source ${VENV_NAME?}/bin/activate # 3. Install dependencies in editable mode uv pip install -e .[tpu] --resolution=lowest @@ -90,7 +90,7 @@ MaxText assumes these GCS buckets are created in the same project and that it ha export PROJECT= export DATASET_GCS_BUCKET= # e.g., gs://my-bucket/my-dataset -bash tools/data_generation/download_dataset.sh ${PROJECT} ${DATASET_GCS_BUCKET} +bash tools/data_generation/download_dataset.sh ${PROJECT?} ${DATASET_GCS_BUCKET?} ``` The command above downloads the c4 dataset to the GCS bucket. @@ -102,14 +102,14 @@ Below is a sample training script.
```sh python3 -m maxtext.trainers.pre_train.train \ src/maxtext/configs/base.yml \ - run_name=${RUN_NAME} \ - base_output_directory=${BASE_OUTPUT_DIRECTORY} \ - load_parameters_path=${MODEL_CKPT_PATH} \ - model_name=${MODEL_NAME} \ - dataset_path=${DATASET_GCS_BUCKET} \ + run_name=${RUN_NAME?} \ + base_output_directory=${BASE_OUTPUT_DIRECTORY?} \ + load_parameters_path=${MODEL_CKPT_PATH?} \ + model_name=${MODEL_NAME?} \ + dataset_path=${DATASET_GCS_BUCKET?} \ async_checkpointing=False \ - tokenizer_path=${MODEL_TOKENIZER} \ - hf_access_token=${HF_TOKEN} \ + tokenizer_path=${MODEL_TOKENIZER?} \ + hf_access_token=${HF_TOKEN?} \ steps=10 per_device_batch_size=1 ``` diff --git a/docs/tutorials/posttraining/knowledge_distillation.md b/docs/tutorials/posttraining/knowledge_distillation.md index 209f39d853..849eb74f84 100644 --- a/docs/tutorials/posttraining/knowledge_distillation.md +++ b/docs/tutorials/posttraining/knowledge_distillation.md @@ -69,18 +69,18 @@ export TPU_VM_NAME= export DISK_NAME= # e.g., my-hyperdisk export DISK_SIZE= # e.g., 500GB -gcloud compute disks create ${DISK_NAME} \ - --size=${DISK_SIZE} \ +gcloud compute disks create ${DISK_NAME?} \ + --size=${DISK_SIZE?} \ --type=hyperdisk-balanced \ - --zone=${ZONE} + --zone=${ZONE?} ``` Then, attach the disk to your TPU VM: ```bash -gcloud compute instances attach-disk ${TPU_VM_NAME} \ - --disk=${DISK_NAME} \ - --zone=${ZONE} +gcloud compute instances attach-disk ${TPU_VM_NAME?} \ + --disk=${DISK_NAME?} \ + --zone=${ZONE?} ``` Inside the TPU VM, format and mount the disk (if not already mounted): @@ -96,8 +96,8 @@ Update the BASE_DIRECTORY to point to the mounted disk and create the directory: ```bash export BASE_NAME= # e.g., knowledge-distillation -export BASE_DIRECTORY=/mnt/hyperdisk/${BASE_NAME} -mkdir -p ${BASE_DIRECTORY} +export BASE_DIRECTORY=/mnt/hyperdisk/${BASE_NAME?} +mkdir -p ${BASE_DIRECTORY?} ``` > **Note:** This tutorial uses a mounted Hyperdisk for performance and reproducibility, because writing large model files and many small I/O operations directly to `gs://` can be significantly slower. @@ -109,8 +109,8 @@ For the teacher model, we will use **vLLM** to run inference. 
vLLM can load Hugg You can simply download the model from Hugging Face to your local directory: ```bash -huggingface-cli login --token $HF_TOKEN -huggingface-cli download Qwen/Qwen3-32B --repo-type model --local-dir ${BASE_DIRECTORY}/qwen3-32b +huggingface-cli login --token ${HF_TOKEN?} +huggingface-cli download Qwen/Qwen3-32B --repo-type model --local-dir ${BASE_DIRECTORY?}/qwen3-32b ``` ### Obtain and prepare the student model @@ -129,13 +129,13 @@ python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu ```bash # Set the checkpoint directory -export PRE_TRAINED_MODEL_CKPT_DIRECTORY=${BASE_DIRECTORY}/llama3.1-8b-ckpt +export PRE_TRAINED_MODEL_CKPT_DIRECTORY=${BASE_DIRECTORY?}/llama3.1-8b-ckpt # Convert to MaxText format python3 -m maxtext.checkpoint_conversion.to_maxtext src/maxtext/configs/base.yml \ model_name=llama3.1-8b \ - hf_access_token=${HF_TOKEN} \ - base_output_directory=${PRE_TRAINED_MODEL_CKPT_DIRECTORY} \ + hf_access_token=${HF_TOKEN?} \ + base_output_directory=${PRE_TRAINED_MODEL_CKPT_DIRECTORY?} \ scan_layers=True skip_jax_distributed_system=True ``` @@ -146,18 +146,18 @@ Use the provided script `generate_distillation_data_vllm.py` to generate the dat Run the generation script: ```bash -export OUTPUT_DATASET=${BASE_DIRECTORY}/datasets/distillation_data.parquet +export OUTPUT_DATASET=${BASE_DIRECTORY?}/datasets/distillation_data.parquet python3 -m tools.data_generation.generate_distillation_data_vllm \ --dataset-path HuggingFaceH4/ultrachat_200k \ --data-split train_sft \ --data-columns messages \ - --hf-access-token $HF_TOKEN \ - --teacher-model ${BASE_DIRECTORY}/qwen3-32b \ + --hf-access-token ${HF_TOKEN?} \ + --teacher-model ${BASE_DIRECTORY?}/qwen3-32b \ --use-chat-template \ --num-prompts 5120 \ --num-generations 2 \ - --output-file ${OUTPUT_DATASET} + --output-file ${OUTPUT_DATASET?} ``` @@ -171,21 +171,21 @@ Example command to run fine-tuning on a TPU v6e-8: ```bash python3 -m maxtext.trainers.post_train.sft.train_sft_deprecated src/maxtext/configs/post_train/sft.yml \ - run_name=${RUN_NAME} \ - base_output_directory=${BASE_DIRECTORY}/distillation/qwen3-32b-distill-llama3.1-8b \ + run_name=${RUN_NAME?} \ + base_output_directory=${BASE_DIRECTORY?}/distillation/qwen3-32b-distill-llama3.1-8b \ tokenizer_path=meta-llama/Llama-3.1-8B-Instruct tokenizer_type=huggingface \ dataset_type=hf \ hf_path=parquet \ - hf_train_files=${OUTPUT_DATASET} \ + hf_train_files=${OUTPUT_DATASET?} \ train_split='train' \ train_data_columns=['messages'] \ - load_parameters_path=${PRE_TRAINED_MODEL_CKPT_DIRECTORY}/0/items \ + load_parameters_path=${PRE_TRAINED_MODEL_CKPT_DIRECTORY?}/0/items \ model_name=llama3.1-8b \ per_device_batch_size=2 \ steps=200 \ ici_expert_parallelism=-1 ici_fsdp_parallelism=4 \ max_target_length=2048 \ - hf_access_token=$HF_TOKEN \ + hf_access_token=${HF_TOKEN?} \ profiler=xplane ``` @@ -195,8 +195,8 @@ The checkpoint from the student model's fine-tuning (on the teacher-generated da ```bash # Get the latest checkpoint for fine-tuned student model -CHECKPOINTS_PATH=${BASE_DIRECTORY}/distillation/qwen3-32b-distill-llama3.1-8b/${RUN_NAME}/checkpoints -checkpoints=$(ls $CHECKPOINTS_PATH) +CHECKPOINTS_PATH=${BASE_DIRECTORY?}/distillation/qwen3-32b-distill-llama3.1-8b/${RUN_NAME?}/checkpoints +checkpoints=$(ls ${CHECKPOINTS_PATH?}) integer_dirs=() for dir in $checkpoints; do dir_name=$(basename "$dir") @@ -206,23 +206,23 @@ for dir in $checkpoints; do done sorted_dirs=($(printf '%s\n' "${integer_dirs[@]}" | sort -n)) largest_dir="${sorted_dirs[-1]}" 
-FINE_TUNED_MODEL_CKPT_PATH=${CHECKPOINTS_PATH}/${largest_dir}/model_params +FINE_TUNED_MODEL_CKPT_PATH=${CHECKPOINTS_PATH?}/${largest_dir}/model_params # Fine-tune student model on original dataset python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml \ - run_name=${RUN_NAME}_stage2 \ - base_output_directory=${BASE_DIRECTORY}/distillation/qwen3-32b-distill-llama3.1-8b \ + run_name=${RUN_NAME?}_stage2 \ + base_output_directory=${BASE_DIRECTORY?}/distillation/qwen3-32b-distill-llama3.1-8b \ tokenizer_path=meta-llama/Llama-3.1-8B-Instruct tokenizer_type=huggingface \ dataset_type=hf \ hf_path='HuggingFaceH4/ultrachat_200k' \ train_split='train_sft' \ train_data_columns=['messages'] \ - load_parameters_path=${FINE_TUNED_MODEL_CKPT_PATH} \ + load_parameters_path=${FINE_TUNED_MODEL_CKPT_PATH?} \ model_name=llama3.1-8b \ per_device_batch_size=2 \ steps=200 \ ici_expert_parallelism=-1 ici_fsdp_parallelism=4 \ max_target_length=2048 \ - hf_access_token=$HF_TOKEN \ + hf_access_token=${HF_TOKEN?} \ profiler=xplane ``` diff --git a/docs/tutorials/posttraining/multimodal.md b/docs/tutorials/posttraining/multimodal.md index 9c4f60b289..df658b88d2 100644 --- a/docs/tutorials/posttraining/multimodal.md +++ b/docs/tutorials/posttraining/multimodal.md @@ -40,8 +40,8 @@ export HF_ACCESS_TOKEN=hf_... export MAXTEXT_CKPT_GCS_PATH=gs://... python -m maxtext.checkpoint_conversion.to_maxtext maxtext/configs/base.yml \ model_name=gemma3-4b \ - hf_access_token=$HF_ACCESS_TOKEN \ - base_output_directory=$MAXTEXT_CKPT_GCS_PATH \ + hf_access_token=${HF_ACCESS_TOKEN?} \ + base_output_directory=${MAXTEXT_CKPT_GCS_PATH?} \ use_multimodal=true \ scan_layers=false ``` @@ -54,8 +54,8 @@ export MAXTEXT_CKPT_GCS_PATH=gs://... python -m maxtext.checkpoint_conversion.standalone_scripts.llama4_ckpt_unscanned \ --model-size=llama4-17b-16e \ --huggingface-checkpoint=True \ - --base-model-path=$LOCAL_HF_MODEL_PATH \ - --maxtext-model-path=$MAXTEXT_CKPT_GCS_PATH + --base-model-path=${LOCAL_HF_MODEL_PATH?} \ + --maxtext-model-path=${MAXTEXT_CKPT_GCS_PATH?} ``` ## Multimodal Decode @@ -75,9 +75,9 @@ To run a forward pass and verify the model's output, use the following command: python -m maxtext.inference.decode \ maxtext/configs/base.yml \ model_name=gemma3-4b \ - hf_access_token=$HF_ACCESS_TOKEN \ + hf_access_token=${HF_ACCESS_TOKEN?} \ tokenizer_path=src/maxtext/assets/tokenizers/tokenizer.gemma3 \ - load_parameters_path=$MAXTEXT_CKPT_GCS_PATH/0/items \ + load_parameters_path=${MAXTEXT_CKPT_GCS_PATH?}/0/items \ per_device_batch_size=1 \ run_name=ht_test \ max_prefill_predict_length=272 \ @@ -112,8 +112,8 @@ python -m maxtext.inference.decode \ maxtext/configs/base.yml \ model_name=gemma3-4b \ ... \ - max_prefill_predict_length=$PREDICT_LENGTH # Adjust to fit image tokens + text prompt \ - max_target_length=$TARGET_LENGTH \ + max_prefill_predict_length=${PREDICT_LENGTH?} # Adjust to fit image tokens + text prompt \ + max_target_length=${TARGET_LENGTH?} \ image_path=/path/to/image1.jpg,/path/to/image2.jpg \ prompt="Describe each image in a short sentence." 
# will be added to prompt if not provided # or prompt="Describe each image in a short sentence: and " @@ -134,11 +134,11 @@ python -m maxtext.trainers.post_train.sft.train_sft_deprecated \ run_name="chartqa-sft" \ model_name=gemma3-4b \ tokenizer_path="google/gemma-3-4b-it" \ - hf_access_token=$HF_ACCESS_TOKEN \ - load_parameters_path=$UNSCANNED_CKPT_PATH \ - base_output_directory=$BASE_OUTPUT_DIRECTORY \ + hf_access_token=${HF_ACCESS_TOKEN?} \ + load_parameters_path=${UNSCANNED_CKPT_PATH?} \ + base_output_directory=${BASE_OUTPUT_DIRECTORY?} \ per_device_batch_size=1 \ - steps=$STEPS \ + steps=${STEPS?} \ max_prefill_predict_length=1024 \ max_target_length=2048 \ checkpoint_period=1000 \ diff --git a/docs/tutorials/posttraining/rl.md b/docs/tutorials/posttraining/rl.md index 272a35ec2a..7f2c366c26 100644 --- a/docs/tutorials/posttraining/rl.md +++ b/docs/tutorials/posttraining/rl.md @@ -48,8 +48,8 @@ Let's get started! # Create a virtual environment export VENV_NAME= # e.g., maxtext_venv pip install uv -uv venv --python 3.12 --seed $VENV_NAME -source $VENV_NAME/bin/activate +uv venv --python 3.12 --seed ${VENV_NAME?} +source ${VENV_NAME?}/bin/activate ``` ### Option 1: From PyPI releases (Recommended) @@ -134,13 +134,13 @@ Run the following command for GRPO: ``` python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \ - model_name=${MODEL} \ - tokenizer_path=${TOKENIZER} \ - load_parameters_path=${MAXTEXT_CKPT_PATH} \ - run_name=${RUN_NAME} \ - base_output_directory=${BASE_OUTPUT_DIRECTORY} \ - hf_access_token=${HF_TOKEN} \ - chips_per_vm=${CHIPS_PER_VM} + model_name=${MODEL?} \ + tokenizer_path=${TOKENIZER?} \ + load_parameters_path=${MAXTEXT_CKPT_PATH?} \ + run_name=${RUN_NAME?} \ + base_output_directory=${BASE_OUTPUT_DIRECTORY?} \ + hf_access_token=${HF_TOKEN?} \ + chips_per_vm=${CHIPS_PER_VM?} ``` The overview of what this run will do is as follows: @@ -158,14 +158,14 @@ Run the following command for GSPO: ``` python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \ - model_name=${MODEL} \ - tokenizer_path=${TOKENIZER} \ - load_parameters_path=${MAXTEXT_CKPT_PATH} \ - run_name=${RUN_NAME} \ - base_output_directory=${BASE_OUTPUT_DIRECTORY} \ - hf_access_token=${HF_TOKEN} \ + model_name=${MODEL?} \ + tokenizer_path=${TOKENIZER?} \ + load_parameters_path=${MAXTEXT_CKPT_PATH?} \ + run_name=${RUN_NAME?} \ + base_output_directory=${BASE_OUTPUT_DIRECTORY?} \ + hf_access_token=${HF_TOKEN?} \ loss_algo=gspo-token \ - chips_per_vm=${CHIPS_PER_VM} + chips_per_vm=${CHIPS_PER_VM?} ``` The overview of what this run will do is as follows: diff --git a/docs/tutorials/posttraining/rl_on_multi_host.md b/docs/tutorials/posttraining/rl_on_multi_host.md index 3693508ec3..ca86efb0ff 100644 --- a/docs/tutorials/posttraining/rl_on_multi_host.md +++ b/docs/tutorials/posttraining/rl_on_multi_host.md @@ -76,7 +76,7 @@ export HF_TOKEN= # -- MaxText configuration -- export BASE_OUTPUT_DIRECTORY= # e.g., gs://my-bucket/my-output-directory export WORKLOAD= # e.g., llama-3-70b-grpo -export MAXTEXT_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/${WORKLOAD}/0/items +export MAXTEXT_CKPT_PATH=${BASE_OUTPUT_DIRECTORY?}/${WORKLOAD?}/0/items # -- Workload configuration -- export TPU_TYPE= # e.g., 'v5p-128' @@ -185,7 +185,7 @@ bash dependencies/scripts/docker_build_dependency_image.sh WORKFLOW=post-trainin > project administrator if you don't have this permission. 
```bash -bash dependencies/scripts/docker_upload_runner.sh CLOUD_IMAGE_NAME=${CLOUD_IMAGE_NAME} +bash dependencies/scripts/docker_upload_runner.sh CLOUD_IMAGE_NAME=${CLOUD_IMAGE_NAME?} ``` ## Submit your RL workload via Pathways @@ -203,35 +203,35 @@ submit the `train_rl.py` script via XPK. ### Submit GRPO workload ```bash -xpk workload create-pathways --workload $WORKLOAD \ ---docker-image gcr.io/$PROJECT_ID/$CLOUD_IMAGE_NAME --cluster $TPU_CLUSTER \ ---tpu-type=$TPU_TYPE --num-slices=1 \ ---project=$PROJECT_ID --priority=high \ ---command "HF_TOKEN=${HF_TOKEN} TF_CPP_MIN_LOG_LEVEL=0 JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE='1' \ +xpk workload create-pathways --workload ${WORKLOAD?} \ +--docker-image gcr.io/${PROJECT_ID?}/${CLOUD_IMAGE_NAME?} --cluster ${TPU_CLUSTER?} \ +--tpu-type=${TPU_TYPE?} --num-slices=1 \ +--project=${PROJECT_ID?} --priority=high \ +--command "HF_TOKEN=${HF_TOKEN?} TF_CPP_MIN_LOG_LEVEL=0 JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE='1' \ python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \ - model_name=${MODEL} \ - tokenizer_path=${TOKENIZER} \ - load_parameters_path=${MAXTEXT_CKPT_PATH} \ - run_name=${WORKLOAD} \ - base_output_directory=${BASE_OUTPUT_DIRECTORY} \ - hf_access_token=${HF_TOKEN}" + model_name=${MODEL?} \ + tokenizer_path=${TOKENIZER?} \ + load_parameters_path=${MAXTEXT_CKPT_PATH?} \ + run_name=${WORKLOAD?} \ + base_output_directory=${BASE_OUTPUT_DIRECTORY?} \ + hf_access_token=${HF_TOKEN?}" ``` ### Submit GSPO workload ```bash -xpk workload create-pathways --workload $WORKLOAD \ ---docker-image gcr.io/$PROJECT_ID/$CLOUD_IMAGE_NAME --cluster $TPU_CLUSTER \ ---tpu-type=$TPU_TYPE --num-slices=1 \ ---project=$PROJECT_ID --priority=high \ ---command "HF_TOKEN=${HF_TOKEN} TF_CPP_MIN_LOG_LEVEL=0 JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE='1' \ +xpk workload create-pathways --workload ${WORKLOAD?} \ +--docker-image gcr.io/${PROJECT_ID?}/${CLOUD_IMAGE_NAME?} --cluster ${TPU_CLUSTER?} \ +--tpu-type=${TPU_TYPE?} --num-slices=1 \ +--project=${PROJECT_ID?} --priority=high \ +--command "HF_TOKEN=${HF_TOKEN?} TF_CPP_MIN_LOG_LEVEL=0 JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE='1' \ python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \ - model_name=${MODEL} \ - tokenizer_path=${TOKENIZER} \ - load_parameters_path=${MAXTEXT_CKPT_PATH} \ - run_name=${WORKLOAD} \ - base_output_directory=${BASE_OUTPUT_DIRECTORY} \ - hf_access_token=${HF_TOKEN} \ + model_name=${MODEL?} \ + tokenizer_path=${TOKENIZER?} \ + load_parameters_path=${MAXTEXT_CKPT_PATH?} \ + run_name=${WORKLOAD?} \ + base_output_directory=${BASE_OUTPUT_DIRECTORY?} \ + hf_access_token=${HF_TOKEN?} \ loss_algo=gspo-token" ``` @@ -241,9 +241,9 @@ python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_ - **Delete a workload**: To remove a failed or unwanted Pathways job, use XPK: ```bash xpk workload delete \ - --workload $WORKLOAD \ - --cluster $TPU_CLUSTER \ - --project $PROJECT_ID + --workload ${WORKLOAD?} \ + --cluster ${TPU_CLUSTER?} \ + --project ${PROJECT_ID?} ``` In case the job still lingers on, you can use `kubectl get pods` to obtain the name of the pod and then run: `kubectl delete pod `. 
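To obtain that pod name, a short sketch, assuming (as with the `${WORKLOAD?}-pathways-head` pods above) that the pod names contain the workload name:

```bash
# List candidate pods for this workload, then delete one by its exact name.
kubectl get pods | grep "${WORKLOAD?}"
```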
diff --git a/docs/tutorials/posttraining/sft.md b/docs/tutorials/posttraining/sft.md index f1ed2ad267..cb3ff85baf 100644 --- a/docs/tutorials/posttraining/sft.md +++ b/docs/tutorials/posttraining/sft.md @@ -30,8 +30,8 @@ In this tutorial we use a single host TPU VM such as `v6e-8/v5p-8`. Let's get st # Create a virtual environment export VENV_NAME= # e.g., maxtext_venv pip install uv -uv venv --python 3.12 --seed $VENV_NAME -source $VENV_NAME/bin/activate +uv venv --python 3.12 --seed ${VENV_NAME?} +source ${VENV_NAME?}/bin/activate ``` Run the following commands to get all the necessary installations. @@ -89,17 +89,17 @@ Now you are ready to run SFT using the following command: ```sh python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml \ - run_name=${RUN_NAME} \ - base_output_directory=${BASE_OUTPUT_DIRECTORY} \ - model_name=${PRE_TRAINED_MODEL} \ - load_parameters_path=${PRE_TRAINED_MODEL_CKPT_PATH} \ - hf_access_token=${HF_TOKEN} \ - tokenizer_path=${PRE_TRAINED_MODEL_TOKENIZER} \ - per_device_batch_size=${PER_DEVICE_BATCH_SIZE} \ - steps=${STEPS} \ - hf_path=${DATASET_NAME} \ - train_split=${TRAIN_SPLIT} \ - train_data_columns=${TRAIN_DATA_COLUMNS} \ + run_name=${RUN_NAME?} \ + base_output_directory=${BASE_OUTPUT_DIRECTORY?} \ + model_name=${PRE_TRAINED_MODEL?} \ + load_parameters_path=${PRE_TRAINED_MODEL_CKPT_PATH?} \ + hf_access_token=${HF_TOKEN?} \ + tokenizer_path=${PRE_TRAINED_MODEL_TOKENIZER?} \ + per_device_batch_size=${PER_DEVICE_BATCH_SIZE?} \ + steps=${STEPS?} \ + hf_path=${DATASET_NAME?} \ + train_split=${TRAIN_SPLIT?} \ + train_data_columns=${TRAIN_DATA_COLUMNS?} \ profiler=xplane ``` diff --git a/docs/tutorials/posttraining/sft_on_multi_host.md b/docs/tutorials/posttraining/sft_on_multi_host.md index 063a452ec8..85e7964c86 100644 --- a/docs/tutorials/posttraining/sft_on_multi_host.md +++ b/docs/tutorials/posttraining/sft_on_multi_host.md @@ -61,7 +61,7 @@ bash dependencies/scripts/docker_build_dependency_image.sh WORKFLOW=post-trainin ```bash export DOCKER_IMAGE_NAME= -bash dependencies/scripts/docker_upload_runner.sh CLOUD_IMAGE_NAME=$DOCKER_IMAGE_NAME +bash dependencies/scripts/docker_upload_runner.sh CLOUD_IMAGE_NAME=${DOCKER_IMAGE_NAME?} ``` The `docker_upload_runner.sh` script uploads your Docker image to Artifact Registry. @@ -86,7 +86,7 @@ export ZONE= export WORKLOAD_NAME= # e.g., sft-$(date +%s) export TPU_TYPE= # e.g., v6e-256 export TPU_SLICE= -export DOCKER_IMAGE="gcr.io/${PROJECT}/${DOCKER_IMAGE_NAME}" +export DOCKER_IMAGE="gcr.io/${PROJECT?}/${DOCKER_IMAGE_NAME?}" # -- MaxText Configuration -- export OUTPUT_PATH= # e.g., gs://my-bucket/my-output-directory @@ -136,14 +136,14 @@ This section provides the command to run SFT on a GKE cluster. 
```bash
xpk workload create \
---cluster=${CLUSTER_NAME} \
---project=${PROJECT} \
---zone=${ZONE} \
---docker-image=${DOCKER_IMAGE} \
---workload=${WORKLOAD_NAME} \
---tpu-type=${TPU_TYPE} \
---num-slices=${TPU_SLICE} \
---command "python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml run_name=$WORKLOAD_NAME base_output_directory=$OUTPUT_PATH model_name=$MODEL_NAME load_parameters_path=$MODEL_CHECKPOINT_PATH hf_access_token=$HF_TOKEN tokenizer_path=$TOKENIZER_PATH per_device_batch_size=1 steps=$STEPS profiler=xplane hf_path=$DATASET_NAME train_split=$TRAIN_SPLIT train_data_columns=$TRAIN_DATA_COLUMNS"
+--cluster=${CLUSTER_NAME?} \
+--project=${PROJECT?} \
+--zone=${ZONE?} \
+--docker-image=${DOCKER_IMAGE?} \
+--workload=${WORKLOAD_NAME?} \
+--tpu-type=${TPU_TYPE?} \
+--num-slices=${TPU_SLICE?} \
+--command "python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml run_name=${WORKLOAD_NAME?} base_output_directory=${OUTPUT_PATH?} model_name=${MODEL_NAME?} load_parameters_path=${MODEL_CHECKPOINT_PATH?} hf_access_token=${HF_TOKEN?} tokenizer_path=${TOKENIZER_PATH?} per_device_batch_size=1 steps=${STEPS?} profiler=xplane hf_path=${DATASET_NAME?} train_split=${TRAIN_SPLIT?} train_data_columns=${TRAIN_DATA_COLUMNS?}"
```

Once the fine-tuning is completed, you can access your model checkpoints at `$OUTPUT_PATH/$WORKLOAD_NAME/checkpoints`.

@@ -152,14 +152,14 @@ Once the fine-tuning is completed, you can access your model checkpoints at `$OU

```bash
xpk workload create-pathways \
---cluster=${CLUSTER_NAME} \
---project=${PROJECT} \
---zone=${ZONE} \
---docker-image=${DOCKER_IMAGE} \
---workload=${WORKLOAD_NAME} \
---tpu-type=${TPU_TYPE} \
---num-slices=${TPU_SLICE} \
---command="JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE=1 python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml run_name=$WORKLOAD_NAME base_output_directory=$OUTPUT_PATH model_name=$MODEL_NAME load_parameters_path=$MODEL_CHECKPOINT_PATH hf_access_token=$HF_TOKEN tokenizer_path=$TOKENIZER_PATH per_device_batch_size=1 steps=$STEPS profiler=xplane checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False enable_single_controller=True"
+--cluster=${CLUSTER_NAME?} \
+--project=${PROJECT?} \
+--zone=${ZONE?} \
+--docker-image=${DOCKER_IMAGE?} \
+--workload=${WORKLOAD_NAME?} \
+--tpu-type=${TPU_TYPE?} \
+--num-slices=${TPU_SLICE?} \
+--command="JAX_PLATFORMS=proxy JAX_BACKEND_TARGET=grpc://127.0.0.1:29000 ENABLE_PATHWAYS_PERSISTENCE=1 python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml run_name=${WORKLOAD_NAME?} base_output_directory=${OUTPUT_PATH?} model_name=${MODEL_NAME?} load_parameters_path=${MODEL_CHECKPOINT_PATH?} hf_access_token=${HF_TOKEN?} tokenizer_path=${TOKENIZER_PATH?} per_device_batch_size=1 steps=${STEPS?} profiler=xplane checkpoint_storage_use_zarr3=False checkpoint_storage_use_ocdbt=False enable_single_controller=True"
```

Once the fine-tuning is completed, you can access your model checkpoints at `$OUTPUT_PATH/$WORKLOAD_NAME/checkpoints`.
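Because every flag in the commands above now uses the `${VAR?}` form, the whole environment can also be validated up front with a no-op expansion before anything is submitted to the cluster. A minimal sketch, assuming the exports shown earlier in this tutorial (the variable list mirrors them and is otherwise illustrative):

```bash
# ':' is the shell no-op; expanding the parameters here fails fast with the
# name of the first unset variable instead of launching a half-configured job.
: "${CLUSTER_NAME?}" "${PROJECT?}" "${ZONE?}" "${DOCKER_IMAGE?}" \
  "${WORKLOAD_NAME?}" "${TPU_TYPE?}" "${TPU_SLICE?}" \
  "${OUTPUT_PATH?}" "${MODEL_NAME?}" "${MODEL_CHECKPOINT_PATH?}" "${HF_TOKEN?}"
```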
diff --git a/src/maxtext/checkpoint_conversion/standalone_scripts/convert_gpt3_ckpt_from_paxml.py b/src/maxtext/checkpoint_conversion/standalone_scripts/convert_gpt3_ckpt_from_paxml.py
index a9b160ad1d..888cf4d2d1 100644
--- a/src/maxtext/checkpoint_conversion/standalone_scripts/convert_gpt3_ckpt_from_paxml.py
+++ b/src/maxtext/checkpoint_conversion/standalone_scripts/convert_gpt3_ckpt_from_paxml.py
@@ -19,8 +19,8 @@
python3 -m maxtext.checkpoint_conversion.standalone_scripts.convert_gpt3_ckpt_from_paxml \
  --paxml-ckpt-path=gs://maxtext-gpt3/ckpt_test/paxml/checkpoints/checkpoint_00000000/state \
  --maxtext-model-name=gpt3-52k \
-  --run-name=$RUN_NAME \
-  --base-output-directory=$BASE_OUTPUT_DIR
+  --run-name=${RUN_NAME?} \
+  --base-output-directory=${BASE_OUTPUT_DIR?}

True cmd for gpt3-175b:
@@ -30,8 +30,8 @@
python3 -m maxtext.checkpoint_conversion.standalone_scripts.convert_gpt3_ckpt_from_paxml \
  --paxml-ckpt-path=gs://mlperf-llm-public2/gpt3_spmd1x64x24_tpuv4-3072_v84_20221101/checkpoints/checkpoint_00004000 \
  --maxtext-model-name=gpt3-175b \
-  --run-name=$RUN_NAME \
-  --base-output-directory=$BASE_OUTPUT_DIR
+  --run-name=${RUN_NAME?} \
+  --base-output-directory=${BASE_OUTPUT_DIR?}
"""

import argparse

diff --git a/src/maxtext/checkpoint_conversion/to_maxtext.py b/src/maxtext/checkpoint_conversion/to_maxtext.py
index 3292b33565..06f95ed85a 100644
--- a/src/maxtext/checkpoint_conversion/to_maxtext.py
+++ b/src/maxtext/checkpoint_conversion/to_maxtext.py
@@ -43,7 +43,7 @@
/usr/bin/time -v python src/MaxText/checkpoint_conversion/to_maxtext.py \
  maxtext/configs/base.yml model_name="gemma2-2b" \
  base_output_directory="/path/to/your/output/directory" \
-  hf_access_token=$HF_TOKEN hardware=cpu skip_jax_distributed_system=True \
+  hf_access_token=${HF_TOKEN?} hardware=cpu skip_jax_distributed_system=True \
  scan_layers=False

For models with scanned layers (e.g., some custom architectures), you might
@@ -54,7 +54,7 @@
/usr/bin/time -v python src/MaxText/checkpoint_conversion/to_maxtext.py \
  maxtext/configs/base.yml model_name="llama3.1-70b" \
  base_output_directory="gs://my-bucket/maxtext-checkpoints" \
-  hf_access_token=$HF_TOKEN hardware=cpu skip_jax_distributed_system=True \
+  hf_access_token=${HF_TOKEN?} hardware=cpu skip_jax_distributed_system=True \
  --lazy_load_tensors=True
"""

diff --git a/src/maxtext/configs/README.md b/src/maxtext/configs/README.md
index 0f681b5cea..ec00acf759 100644
--- a/src/maxtext/configs/README.md
+++ b/src/maxtext/configs/README.md
@@ -31,17 +31,17 @@ These configurations do 3 things:
Create a network with an MTU of 8896 bytes and set up firewall rules. (Creating a network requires `compute.networks.create` permission in your project)
```
-  gcloud compute networks create mtu9k --mtu=8896 --project=${PROJECT} --subnet-mode=auto --bgp-routing-mode=regional
+  gcloud compute networks create mtu9k --mtu=8896 --project=${PROJECT?} --subnet-mode=auto --bgp-routing-mode=regional
```
```
-  gcloud compute firewall-rules create mtu9kfw --network mtu9k --allow tcp,icmp,udp --project=${PROJECT}
+  gcloud compute firewall-rules create mtu9kfw --network mtu9k --allow tcp,icmp,udp --project=${PROJECT?}
```
When you create your TPUs, you need to indicate they should be part of this network. Here is an example of a queued-resources request on GCE using the `--network` flag (`--network=mtu9k`).
```
-  gcloud alpha compute tpus queued-resources create ${QR_ID} --node-prefix=${TPU_NAME} --node-count=${NUM_SLICES} --accelerator_type=${ACCELERATOR_TYPE} --runtime_version=${RUNTIME_VERSION} --network=mtu9k --project=${PROJECT} --zone=${ZONE}
+  gcloud alpha compute tpus queued-resources create ${QR_ID?} --node-prefix=${TPU_NAME?} --node-count=${NUM_SLICES?} --accelerator_type=${ACCELERATOR_TYPE?} --runtime_version=${RUNTIME_VERSION?} --network=mtu9k --project=${PROJECT?} --zone=${ZONE?}
```
Note: If you want to use only one slice, you need to replace node-prefix with node-id, and remove node-count.
@@ -49,7 +49,7 @@ These configurations do 3 things:
```
export CLUSTER_ARGUMENTS="--network=mtu9k --subnetwork=mtu9k"
-  python3 xpk/xpk.py cluster create --cluster ${YOUR_CLUSTER_NAME} --tpu-type ${ACCELERATOR_TYPE} --num-slices ${NUM_SLICES} --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
+  python3 xpk/xpk.py cluster create --cluster ${YOUR_CLUSTER_NAME?} --tpu-type ${ACCELERATOR_TYPE?} --num-slices ${NUM_SLICES?} --custom-cluster-arguments="${CLUSTER_ARGUMENTS?}"
```

### Run model config scripts on TPUs
@@ -59,19 +59,19 @@ These configurations do 3 things:
Running with `multihost_runner.py` on GCE:
```
-  python3 multihost_runner.py --TPU_PREFIX=${TPU_PREFIX} --COMMAND="bash setup.sh && bash src/maxtext/configs/tpu/v5p/128b.sh RUN_NAME=${YOUR_RUN_NAME} OUTPUT_PATH=${MAXTEXT_OUTPUT_PATH} DATASET_PATH=${MAXTEXT_DATASET_PATH} PLATFORM=gce"
+  python3 multihost_runner.py --TPU_PREFIX=${TPU_PREFIX?} --COMMAND="bash setup.sh && bash src/maxtext/configs/tpu/v5p/128b.sh RUN_NAME=${YOUR_RUN_NAME?} OUTPUT_PATH=${MAXTEXT_OUTPUT_PATH?} DATASET_PATH=${MAXTEXT_DATASET_PATH?} PLATFORM=gce"
```
Running with `multihost_job.py` on GCE:
```
-  python3 multihost_job.py --NUM_SLICES=${NUM_SLICES} --TPU_TYPE=${ACCELERATOR_TYPE} --VERSION=${RUNTIME_VERSION} --RUN_NAME=${RUN_NAME} --BUCKET_NAME=${GCS_BUCKET_NAME} --COMMAND="bash setup.sh && bash src/maxtext/configs/tpu/v5p/128b.sh RUN_NAME=${YOUR_RUN_NAME} OUTPUT_PATH=${MAXTEXT_OUTPUT_PATH} DATASET_PATH=${MAXTEXT_DATASET_PATH} PLATFORM=gce"
+  python3 multihost_job.py --NUM_SLICES=${NUM_SLICES?} --TPU_TYPE=${ACCELERATOR_TYPE?} --VERSION=${RUNTIME_VERSION?} --RUN_NAME=${RUN_NAME?} --BUCKET_NAME=${GCS_BUCKET_NAME?} --COMMAND="bash setup.sh && bash src/maxtext/configs/tpu/v5p/128b.sh RUN_NAME=${YOUR_RUN_NAME?} OUTPUT_PATH=${MAXTEXT_OUTPUT_PATH?} DATASET_PATH=${MAXTEXT_DATASET_PATH?} PLATFORM=gce"
  # Add --CQR_EXTRA_ARGS="--network=mtu9k" to the command if you would like to use the custom MTU network.
```
Running with `XPK` on GKE:
```
-  xpk workload create --cluster ${YOUR_CLUSTER_NAME} --docker-image gcr.io/${PROJECT}/${YOUR_IMAGE_NAME} --workload ${YOUR_RUN_NAME} --tpu-type=${ACCELERATOR_TYPE} --num-slices=${NUM_SLICES} --command "bash src/maxtext/configs/tpu/v5p/128b.sh OUTPUT_PATH=${MAXTEXT_OUTPUT_PATH} DATASET_PATH=${MAXTEXT_DATASET_PATH} PLATFORM=gke"
+  xpk workload create --cluster ${YOUR_CLUSTER_NAME?} --docker-image gcr.io/${PROJECT?}/${YOUR_IMAGE_NAME?} --workload ${YOUR_RUN_NAME?} --tpu-type=${ACCELERATOR_TYPE?} --num-slices=${NUM_SLICES?} --command "bash src/maxtext/configs/tpu/v5p/128b.sh OUTPUT_PATH=${MAXTEXT_OUTPUT_PATH?} DATASET_PATH=${MAXTEXT_DATASET_PATH?} PLATFORM=gke"
```
Note: When running these scripts, be sure to set the `PLATFORM` flag to the platform you are running on: `"gce"` or `"gke"`.
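Before requesting TPUs on the custom MTU network created above, it can be worth confirming the MTU actually took effect. A quick check (a sketch; `mtu9k` matches the network name used above):

```bash
# Prints 8896 if the network was created with the intended MTU
gcloud compute networks describe mtu9k --project=${PROJECT?} --format="value(mtu)"
```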
diff --git a/src/maxtext/examples/sft_train_and_evaluate.py b/src/maxtext/examples/sft_train_and_evaluate.py
index 5bad685e79..779c8b9c17 100644
--- a/src/maxtext/examples/sft_train_and_evaluate.py
+++ b/src/maxtext/examples/sft_train_and_evaluate.py
@@ -25,8 +25,8 @@
# Create a virtual environment
export VENV_NAME= # e.g., maxtext_venv
pip install uv
-uv venv --python 3.12 --seed $VENV_NAME
-source $VENV_NAME/bin/activate
+uv venv --python 3.12 --seed ${VENV_NAME?}
+source ${VENV_NAME?}/bin/activate

# Run the following commands to install all the necessary dependencies.

@@ -44,23 +44,24 @@
export HF_ACCESS_TOKEN=

python3 -m maxtext.examples.sft_train_and_evaluate maxtext/configs/post_train/sft.yml \
-  run_name=$RUN_NAME base_output_directory=$OUTPUT_PATH \
-  model_name=$MODEL_NAME load_parameters_path=$MODEL_CHECKPOINT_PATH \
-  hf_access_token=$HF_ACCESS_TOKEN tokenizer_path=$TOKENIZER_PATH
+  run_name=${RUN_NAME?} base_output_directory=${OUTPUT_PATH?} \
+  model_name=${MODEL_NAME?} load_parameters_path=${MODEL_CHECKPOINT_PATH?} \
+  hf_access_token=${HF_ACCESS_TOKEN?} tokenizer_path=${TOKENIZER_PATH?}
```

## Example command to run on multi-host TPUs using McJAX:
```
# Build & upload docker image
export DOCKER_IMAGE_NAME=${USER}_runner
-bash docker_build_dependency_image.sh MODE=post-training && bash docker_upload_runner.sh CLOUD_IMAGE_NAME=$DOCKER_IMAGE_NAME
+bash docker_build_dependency_image.sh MODE=post-training && \
+  bash docker_upload_runner.sh CLOUD_IMAGE_NAME=${DOCKER_IMAGE_NAME?}

# Environment configurations
export PROJECT=
export CLUSTER_NAME=
export ZONE=
export TPU_TYPE=
-export DOCKER_IMAGE="gcr.io/${PROJECT}/${DOCKER_IMAGE_NAME}
+export DOCKER_IMAGE="gcr.io/${PROJECT?}/${DOCKER_IMAGE_NAME?}"
export RUN_NAME=$(date +%Y-%m-%d-%H-%M-%S)
export OUTPUT_PATH=
export MODEL_NAME=llama3.1-8b
@@ -70,15 +71,16 @@
# Run workload via XPK
xpk workload create \
---cluster ${CLUSTER_NAME} \
---docker-image ${DOCKER_IMAGE} \
---workload=sft-${RUN_NAME} \
---tpu-type ${TPU_TYPE} --num-slices=1 --zone=${ZONE} \
---project=${PROJECT} \
---command "HF_TOKEN=$HF_ACCESS_TOKEN python3 -m maxtext.examples.sft_train_and_evaluate maxtext/configs/post_train/sft.yml \
-  run_name=$RUN_NAME base_output_directory=$OUTPUT_PATH \
-  model_name=$MODEL_NAME load_parameters_path=$MODEL_CHECKPOINT_PATH \
-  hf_access_token=$HF_ACCESS_TOKEN tokenizer_path=$TOKENIZER_PATH"
+--cluster ${CLUSTER_NAME?} \
+--docker-image ${DOCKER_IMAGE?} \
+--workload=sft-${RUN_NAME?} \
+--tpu-type ${TPU_TYPE?} --num-slices=1 --zone=${ZONE?} \
+--project=${PROJECT?} \
+--command "HF_TOKEN=${HF_ACCESS_TOKEN?} \
+  python3 -m maxtext.examples.sft_train_and_evaluate maxtext/configs/post_train/sft.yml \
+  run_name=${RUN_NAME?} base_output_directory=${OUTPUT_PATH?} \
+  model_name=${MODEL_NAME?} load_parameters_path=${MODEL_CHECKPOINT_PATH?} \
+  hf_access_token=${HF_ACCESS_TOKEN?} tokenizer_path=${TOKENIZER_PATH?}"
```
"""

diff --git a/src/maxtext/inference/jetstream_pathways/README.md b/src/maxtext/inference/jetstream_pathways/README.md
index 5a63b89f13..99cd718815 100644
--- a/src/maxtext/inference/jetstream_pathways/README.md
+++ b/src/maxtext/inference/jetstream_pathways/README.md
@@ -4,8 +4,8 @@ These instructions are to build the MaxText + JetStream + Pathways Server image,

```
docker build -t jetstream-pathways .
-docker tag jetstream-pathways us-docker.pkg.dev/${PROJECT_ID}/jetstream/jetstream-pathways:latest
-docker push us-docker.pkg.dev/${PROJECT_ID}/jetstream/jetstream-pathways:latest
+docker tag jetstream-pathways us-docker.pkg.dev/${PROJECT_ID?}/jetstream/jetstream-pathways:latest
+docker push us-docker.pkg.dev/${PROJECT_ID?}/jetstream/jetstream-pathways:latest
```

If you would like to change the version of MaxText or JetStream the image is built off of, change the `MAXTEXT_VERSION` / `JETSTREAM_VERSION` environment variable:

diff --git a/src/maxtext/inference/maxengine/maxengine_server_deployment/README.md b/src/maxtext/inference/maxengine/maxengine_server_deployment/README.md
index d413bba995..3351610f60 100644
--- a/src/maxtext/inference/maxengine/maxengine_server_deployment/README.md
+++ b/src/maxtext/inference/maxengine/maxengine_server_deployment/README.md
@@ -4,8 +4,8 @@ These instructions are to build the Maxengine Server image, which calls an entry

```
docker build -t maxengine-server .
-docker tag maxengine-server us-docker.pkg.dev/${PROJECT_ID}/jetstream/maxengine-server:latest
-docker push us-docker.pkg.dev/${PROJECT_ID}/jetstream/maxengine-server:latest
+docker tag maxengine-server us-docker.pkg.dev/${PROJECT_ID?}/jetstream/maxengine-server:latest
+docker push us-docker.pkg.dev/${PROJECT_ID?}/jetstream/maxengine-server:latest
```

If you would like to change the version of MaxText or JetStream the image is built off of, change the `MAXTEXT_VERSION` / `JETSTREAM_VERSION` environment variable:

diff --git a/src/maxtext/inference/mlperf/README.md b/src/maxtext/inference/mlperf/README.md
index 05404992f0..e6fc9d1d80 100644
--- a/src/maxtext/inference/mlperf/README.md
+++ b/src/maxtext/inference/mlperf/README.md
@@ -35,8 +35,8 @@ Please try running `conda install -c conda-forge gcc_linux-64 gxx_linux-64 libst

```sh
export DATA_DISK_DIR=~/loadgen_run_data
-mkdir -p ${DATA_DISK_DIR}
-cd ${DATA_DISK_DIR}
+mkdir -p ${DATA_DISK_DIR?}
+cd ${DATA_DISK_DIR?}
```

#### Llama2-70b:
@@ -100,7 +100,7 @@ export SAVE_QUANT_PARAMS_PATH=gs://${USER}-bkt/quantized/llama2-70b-chat
# other tokenizers under src/maxtext/assets/ directory.
export TOKENIZER_PATH="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets}}"'/tokenizer.llama2'
cd maxtext && \
-python3 -m maxtext.inference.decode src/maxtext/configs/base.yml tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${LOAD_PARAMS_PATH} max_prefill_predict_length=1024 max_target_length=2048 model_name=llama2-70b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=-1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=11 attention=dot_product quantization=int8 save_quantized_params_path=${SAVE_QUANT_PARAMS_PATH}
+python3 -m maxtext.inference.decode src/maxtext/configs/base.yml tokenizer_path=${TOKENIZER_PATH?} load_parameters_path=${LOAD_PARAMS_PATH?} max_prefill_predict_length=1024 max_target_length=2048 model_name=llama2-70b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=-1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=11 attention=dot_product quantization=int8 save_quantized_params_path=${SAVE_QUANT_PARAMS_PATH?}
```

Your checkpoint is generated at `$SAVE_QUANT_PARAMS_PATH`. This is used to set the `load_parameters_path` param in the `MAXENGINE_ARGS` env variable below.
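Before wiring `$SAVE_QUANT_PARAMS_PATH` into `MAXENGINE_ARGS`, it is easy to confirm the quantized weights actually landed in the bucket (a sketch; the path comes from the export above):

```bash
# Lists the int8 checkpoint files written by the quantization run
gsutil ls -r "${SAVE_QUANT_PARAMS_PATH?}" | head
```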
@@ -125,7 +125,7 @@ export MODEL_SIZE=llama3.1-405b
export QUANTIZE_TYPE=int8

cd maxtext && \
-python3 -m maxtext.checkpoint_conversion.load_and_quantize_checkpoint src/maxtext/configs/base.yml tokenizer_path=${TOKENIZER} load_parameters_path=${LOAD_PARAMS_PATH} max_prefill_predict_length=1024 max_target_length=2048 model_name=${MODEL_SIZE} ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=-1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=1 attention=dot_product quantization=${QUANTIZE_TYPE} save_quantized_params_path=${SAVE_QUANT_PARAMS_PATH} async_checkpointing=false
+python3 -m maxtext.checkpoint_conversion.load_and_quantize_checkpoint src/maxtext/configs/base.yml tokenizer_path=${TOKENIZER?} load_parameters_path=${LOAD_PARAMS_PATH?} max_prefill_predict_length=1024 max_target_length=2048 model_name=${MODEL_SIZE?} ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=-1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=1 attention=dot_product quantization=${QUANTIZE_TYPE?} save_quantized_params_path=${SAVE_QUANT_PARAMS_PATH?} async_checkpointing=false
```

The quantized checkpoint is saved at `${SAVE_QUANT_PARAMS_PATH}`

@@ -133,7 +133,7 @@ The quantized checkpoint is saved at `${SAVE_QUANT_PARAMS_PATH}`
### HuggingFace login
```
export HUGGING_FACE_TOKEN=
-huggingface-cli login --token $HUGGING_FACE_TOKEN
+huggingface-cli login --token ${HUGGING_FACE_TOKEN?}
```

### Run Offline Benchmarks
@@ -172,7 +172,7 @@ bash benchmarks_llama2-70b-trillium_2x4.sh -b=all

#### Mixtral-8x7b:
```
export PREFILL_LENS_AND_PER_DEVICE_BATCH_SIZES="256,144|512,72|2048,18"
-export MAXENGINE_ARGS="model_name=mixtral-8x7b tokenizer_path=${TOKENIZER_PATH} quantization=int8 quantize_kvcache=True load_parameters_path=${SAVE_QUANT_PARAMS_PATH} checkpoint_is_quantized=True megablox=False sparse_matmul=False capacity_factor=1 model_call_mode=inference compute_axis_order=0,2,1,3 ar_cache_axis_order=0,2,1,3"
+export MAXENGINE_ARGS="model_name=mixtral-8x7b tokenizer_path=${TOKENIZER_PATH?} quantization=int8 quantize_kvcache=True load_parameters_path=${SAVE_QUANT_PARAMS_PATH?} checkpoint_is_quantized=True megablox=False sparse_matmul=False capacity_factor=1 model_call_mode=inference compute_axis_order=0,2,1,3 ar_cache_axis_order=0,2,1,3"
```

##### Test Run

diff --git a/src/maxtext/trainers/post_train/rl/train_rl.py b/src/maxtext/trainers/post_train/rl/train_rl.py
index 9dea72446d..b0ac010be3 100644
--- a/src/maxtext/trainers/post_train/rl/train_rl.py
+++ b/src/maxtext/trainers/post_train/rl/train_rl.py
@@ -27,18 +27,18 @@
model_name=llama3.1-8b \
tokenizer_path=meta-llama/Llama-3.1-8B-Instruct \
load_parameters_path=gs://path/to/checkpoint/0/items \
-  run_name=$WORKLOAD \
-  base_output_directory=$OUTPUT_PATH \
-  hf_access_token=$HF_TOKEN
+  run_name=${WORKLOAD?} \
+  base_output_directory=${OUTPUT_PATH?} \
+  hf_access_token=${HF_TOKEN?}

# GSPO on Llama3.1-70B-Instruct
python3 -m src.maxtext.trainers.post_train.rl.train_rl src/maxtext/configs/post_train/rl.yml \
model_name=llama3.1-70b \
tokenizer_path=meta-llama/Llama-3.1-70B-Instruct \
load_parameters_path=gs://path/to/checkpoint/0/items \
-  run_name=$WORKLOAD \
-  base_output_directory=$OUTPUT_PATH \
-  hf_access_token=$HF_TOKEN \
+  run_name=${WORKLOAD?} \
+  base_output_directory=${OUTPUT_PATH?} \
+  hf_access_token=${HF_TOKEN?} \
loss_algo=gspo-token
"""

diff --git a/src/maxtext/trainers/post_train/sft/train_sft.py b/src/maxtext/trainers/post_train/sft/train_sft.py
index 19b2ca68b4..6a34f87217 100644
--- a/src/maxtext/trainers/post_train/sft/train_sft.py
+++ b/src/maxtext/trainers/post_train/sft/train_sft.py
@@ -20,17 +20,17 @@ Example command:
Training & Evaluation:
python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml \
-  run_name=$RUN_NAME base_output_directory=$BASE_OUTPUT_DIRECTORY \
-  model_name=$MODEL_NAME load_parameters_path=$CHECKPOINT_PATH \
-  hf_access_token=$HF_ACCESS_TOKEN tokenizer_path=$TOKENIZER_PATH \
+  run_name=${RUN_NAME?} base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
+  model_name=${MODEL_NAME?} load_parameters_path=${CHECKPOINT_PATH?} \
+  hf_access_token=${HF_ACCESS_TOKEN?} tokenizer_path=${TOKENIZER_PATH?} \
  per_device_batch_size=1 max_target_length=1024 \
  eval_interval=2 eval_steps=2 steps=10 profiler=xplane weight_dtype=bfloat16

Training:
python3 -m maxtext.trainers.post_train.sft.train_sft src/maxtext/configs/post_train/sft.yml \
-  run_name=$RUN_NAME base_output_directory=$BASE_OUTPUT_DIRECTORY \
-  model_name=$MODEL_NAME load_parameters_path=$CHECKPOINT_PATH \
-  hf_access_token=$HF_ACCESS_TOKEN tokenizer_path=$TOKENIZER_PATH \
+  run_name=${RUN_NAME?} base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
+  model_name=${MODEL_NAME?} load_parameters_path=${CHECKPOINT_PATH?} \
+  hf_access_token=${HF_ACCESS_TOKEN?} tokenizer_path=${TOKENIZER_PATH?} \
  per_device_batch_size=1 max_target_length=1024 \
  eval_interval=-1 steps=10 profiler=xplane weight_dtype=bfloat16
"""

diff --git a/src/maxtext/utils/layerwise_quantization.py b/src/maxtext/utils/layerwise_quantization.py
index 93f98cfabf..4be05ff7e1 100644
--- a/src/maxtext/utils/layerwise_quantization.py
+++ b/src/maxtext/utils/layerwise_quantization.py
@@ -20,12 +20,12 @@
Example cmd:

python3 -m MaxText.layerwise_quantization src/maxtext/configs/base.yml \
-  tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${LOAD_PARAMS_PATH} \
+  tokenizer_path=${TOKENIZER_PATH?} load_parameters_path=${LOAD_PARAMS_PATH?} \
  model_name=deepseek2-16b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 \
  ici_tensor_parallelism=-1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=1 \
  attention=dot_product quantization=int8 async_checkpointing=false enable_single_controller=true \
  tokenizer_type=huggingface megablox=false sparse_matmul=false \
-  save_quantized_params_path=${SAVE_PARAMS_PATH} checkpoint_storage_use_ocdbt=False \
+  save_quantized_params_path=${SAVE_PARAMS_PATH?} checkpoint_storage_use_ocdbt=False \
  checkpoint_storage_use_zarr3=False
"""

diff --git a/tests/end_to_end/tpu/deepseek/Run_DeepSeek.md b/tests/end_to_end/tpu/deepseek/Run_DeepSeek.md
index 53c158584e..13c29955ca 100644
--- a/tests/end_to_end/tpu/deepseek/Run_DeepSeek.md
+++ b/tests/end_to_end/tpu/deepseek/Run_DeepSeek.md
@@ -35,7 +35,7 @@ You can train from scratch to generate a new checkpoint. One example command to

```sh
python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_DIRECTORY} \
+  base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
  run_name=matmul_pre_training \
  per_device_batch_size=4 \
  enable_checkpointing=false \
@@ -69,9 +69,9 @@ One example command to run general finetuning with V3 on v5p-256.
```sh
python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_DIRECTORY} \
-  dataset_path=${DATASET_PATH} \
-  load_parameters_path=${CONVERTED_CHECKPOINT} \
+  base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
+  dataset_path=${DATASET_PATH?} \
+  load_parameters_path=${CONVERTED_CHECKPOINT?} \
  run_name=matmul_fine_tuning \
  per_device_batch_size=4 \
  model_name=deepseek3-671b \
@@ -115,8 +115,8 @@ One example command to run supervised finetuning with V3 on v5p-256. Supervised

```sh
python3 -m maxtext.trainers.post_train.sft.train_sft_deprecated src/maxtext/configs/post_train/sft.yml \
-  base_output_directory=${BASE_OUTPUT_DIRECTORY} \
-  load_parameters_path=${CONVERTED_CHECKPOINT} \
+  base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
+  load_parameters_path=${CONVERTED_CHECKPOINT?} \
  run_name=matmul_supervised_fine_tuning \
  per_device_batch_size=4 \
  model_name=deepseek3-671b \
@@ -141,8 +141,8 @@ One example command to run decoding with V3 on v5p-256 with unscanned checkpoint

```sh
python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_DIRECTORY} \
-  load_parameters_path=${CONVERTED_CHECKPOINT} \
+  base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
+  load_parameters_path=${CONVERTED_CHECKPOINT?} \
  run_name=decode \
  per_device_batch_size=1 \
  enable_checkpointing=false \
@@ -191,7 +191,7 @@ python3 -m tests.utils.forward_pass_logit_checker \
  src/maxtext/configs/base.yml \
  tokenizer_type=huggingface \
  tokenizer_path=deepseek-ai/DeepSeek-V2-Lite \
-  load_parameters_path=${CONVERTED_CHECKPOINT} \
+  load_parameters_path=${CONVERTED_CHECKPOINT?} \
  run_name=forward_pass_test_deepseek2-16b \
  per_device_batch_size=1 \
  model_name=deepseek2-16b \

diff --git a/tests/end_to_end/tpu/gemma3/Run_Gemma3.md b/tests/end_to_end/tpu/gemma3/Run_Gemma3.md
index 7edc07f146..f73068a3ea 100644
--- a/tests/end_to_end/tpu/gemma3/Run_Gemma3.md
+++ b/tests/end_to_end/tpu/gemma3/Run_Gemma3.md
@@ -25,7 +25,7 @@ We provide examples for checkpoint conversion and decoding/training/finetuning G
You can train from scratch to generate a new checkpoint. One example command to run pretraining of the Gemma3-4B model is as follows:

```sh
-python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml model_name=gemma3-4b base_output_directory=${BASE_OUTPUT_DIRECTORY} dataset_path=${DATASET_PATH} tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"/tokenizer.gemma3 per_device_batch_size=1 run_name=runner_pretrain_gemma3_4b steps=10 enable_checkpointing=false sharding_tolerance=0.03
+python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml model_name=gemma3-4b base_output_directory=${BASE_OUTPUT_DIRECTORY?} dataset_path=${DATASET_PATH?} tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"/tokenizer.gemma3 per_device_batch_size=1 run_name=runner_pretrain_gemma3_4b steps=10 enable_checkpointing=false sharding_tolerance=0.03
```

## Checkpoint Conversion
@@ -35,12 +35,12 @@ To obtain the Gemma3 model weights, follow the instructions provided on [Kaggle]

After the conversion, you will have a MaxText-compatible checkpoint which allows you to fine-tune it with different datasets.
One example command to fine-tune a Gemma3-4B model is as follows:
```
-python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml model_name=gemma3-4b base_output_directory=${BASE_OUTPUT_DIRECTORY} dataset_path=${DATASET_PATH} load_parameters_path=${CONVERTED_CHECKPOINT} tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"/tokenizer.gemma3 per_device_batch_size=1 run_name=runner_finetune_gemma3_4b steps=10 enable_checkpointing=true sharding_tolerance=0.03
+python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml model_name=gemma3-4b base_output_directory=${BASE_OUTPUT_DIRECTORY?} dataset_path=${DATASET_PATH?} load_parameters_path=${CONVERTED_CHECKPOINT?} tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"/tokenizer.gemma3 per_device_batch_size=1 run_name=runner_finetune_gemma3_4b steps=10 enable_checkpointing=true sharding_tolerance=0.03
```
## Decoding
One example of using a converted checkpoint to decode with the prompt "I love to":
```
-python3 -m maxtext.inference.decode src/maxtext/configs/base.yml model_name=gemma3-4b tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"/tokenizer.gemma3 load_parameters_path=${CONVERTED_CHECKPOINT} per_device_batch_size=1 run_name=runner_decode_gemma3_4b max_prefill_predict_length=8 max_target_length=16 dataset_type=synthetic steps=10 async_checkpointing=false scan_layers=false prompt="I love to"
+python3 -m maxtext.inference.decode src/maxtext/configs/base.yml model_name=gemma3-4b tokenizer_path="${MAXTEXT_ASSETS_ROOT:-${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/maxtext/assets/tokenizers}}"/tokenizer.gemma3 load_parameters_path=${CONVERTED_CHECKPOINT?} per_device_batch_size=1 run_name=runner_decode_gemma3_4b max_prefill_predict_length=8 max_target_length=16 dataset_type=synthetic steps=10 async_checkpointing=false scan_layers=false prompt="I love to"
```
\ No newline at end of file

diff --git a/tests/end_to_end/tpu/gpt_oss/run_gpt_oss.md b/tests/end_to_end/tpu/gpt_oss/run_gpt_oss.md
index a7c05c6a30..19152fff5f 100644
--- a/tests/end_to_end/tpu/gpt_oss/run_gpt_oss.md
+++ b/tests/end_to_end/tpu/gpt_oss/run_gpt_oss.md
@@ -59,7 +59,7 @@ You can train from scratch to generate a new checkpoint. One example command to

```sh
python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} \
+  base_output_directory=${BASE_OUTPUT_PATH?} \
  run_name=megablox_pre_training \
  model_name=gpt-oss-20b \
  tokenizer_type=huggingface \
@@ -85,15 +85,15 @@ One example command to run general finetuning with gpt-oss-20b on v5p-8.

```sh
python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} \
+  base_output_directory=${BASE_OUTPUT_PATH?} \
  run_name=megablox_fine_tuning \
  model_name=gpt-oss-20b \
  tokenizer_type=huggingface \
  tokenizer_path=openai/gpt-oss-20b \
-  dataset_path=${DATASET_PATH} \
+  dataset_path=${DATASET_PATH?} \
  enable_checkpointing=true \
  async_checkpointing=false \
-  load_parameters_path=${SCANNED_CKPT_PATH} \
+  load_parameters_path=${SCANNED_CKPT_PATH?} \
  scan_layers=True \
  attention=flash \
  sparse_matmul=True \
@@ -111,7 +111,7 @@ One example command to run supervised finetuning with gpt-oss-20b on v5p-8. Supe
```sh
python3 -m maxtext.trainers.post_train.sft.train_sft_deprecated src/maxtext/configs/post_train/sft.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} \
+  base_output_directory=${BASE_OUTPUT_PATH?} \
  run_name=megablox_supervised_fine_tuning \
  model_name=gpt-oss-20b \
  tokenizer_type=huggingface \
@@ -119,7 +119,7 @@ python3 -m maxtext.trainers.post_train.sft.train_sft_deprecated src/maxtext/conf
  dataset_type=hf \
  enable_checkpointing=true \
  async_checkpointing=false \
-  load_parameters_path=${SCANNED_CKPT_PATH} \
+  load_parameters_path=${SCANNED_CKPT_PATH?} \
  scan_layers=True \
  attention=flash \
  sparse_matmul=True \
@@ -138,13 +138,13 @@ One example command to run decoding with gpt-oss-20b on v5p-8 with unscanned che

```sh
python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} \
+  base_output_directory=${BASE_OUTPUT_PATH?} \
  run_name=decode \
  model_name=gpt-oss-20b \
  tokenizer_type=huggingface \
  tokenizer_path=openai/gpt-oss-20b \
-  hf_access_token=${HF_TOKEN} \
-  load_parameters_path=${UNSCANNED_CKPT_PATH} \
+  hf_access_token=${HF_TOKEN?} \
+  load_parameters_path=${UNSCANNED_CKPT_PATH?} \
  scan_layers=False \
  attention=dot_product \
  sparse_matmul=True \
@@ -183,10 +183,10 @@ Run the command below to compare logits between HuggingFace and MaxText.

```sh
python3 -m tests.utils.forward_pass_logit_checker \
  src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_PATH} \
+  base_output_directory=${BASE_OUTPUT_PATH?} \
  run_name=forward_logits_check \
  model_name=gpt-oss-20b \
-  load_parameters_path=${UNSCANNED_CKPT_PATH} \
+  load_parameters_path=${UNSCANNED_CKPT_PATH?} \
  scan_layers=false \
  attention=dot_product \
  sparse_matmul=True \

diff --git a/tests/end_to_end/tpu/llama4/Run_Llama4.md b/tests/end_to_end/tpu/llama4/Run_Llama4.md
index ada83d29c1..7571389fe0 100644
--- a/tests/end_to_end/tpu/llama4/Run_Llama4.md
+++ b/tests/end_to_end/tpu/llama4/Run_Llama4.md
@@ -44,7 +44,7 @@ You can train from scratch to generate a new checkpoint. One example command to
```sh
python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_DIRECTORY} \
+  base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
  run_name=matmul_pre_training \
  per_device_batch_size=1 \
  enable_checkpointing=false \
@@ -66,12 +66,12 @@ In order to run an example decoding with Llama4 Scout, you can use a command suc

```sh
python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_DIRECTORY} \
+  base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
  run_name=decode \
  model_name=llama4-17b-16e \
  tokenizer_path="meta-llama/Llama-4-Scout-17B-16E" \
-  hf_access_token=${HF_TOKEN} \
-  load_parameters_path=${UNSCANNED_CKPT_PATH} \
+  hf_access_token=${HF_TOKEN?} \
+  load_parameters_path=${UNSCANNED_CKPT_PATH?} \
  scan_layers=false \
  attention=dot_product \
  sparse_matmul=false \

diff --git a/tests/end_to_end/tpu/qwen/moe/run_qwen_moe.md b/tests/end_to_end/tpu/qwen/moe/run_qwen_moe.md
index cc82ae8589..87f149f6d7 100644
--- a/tests/end_to_end/tpu/qwen/moe/run_qwen_moe.md
+++ b/tests/end_to_end/tpu/qwen/moe/run_qwen_moe.md
@@ -45,8 +45,8 @@ After converting the checkpoint, you can use it for fine-tuning or start a pre-t

```
python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml\
-  base_output_directory=${BASE_OUTPUT_DIRECTORY}\
-  dataset_path=${DATASET_PATH}\
+  base_output_directory=${BASE_OUTPUT_DIRECTORY?}\
+  dataset_path=${DATASET_PATH?}\
  load_parameters_path=gs://your-gcs-bucket/qwen3_maxtext_ckpt/0/items\
  run_name=qwen3_finetuning\
  per_device_batch_size=1\

diff --git a/tests/end_to_end/tpu/qwen/next/run_qwen3_next.md b/tests/end_to_end/tpu/qwen/next/run_qwen3_next.md
index 25898cc3b1..0cd3d4bbf8 100644
--- a/tests/end_to_end/tpu/qwen/next/run_qwen3_next.md
+++ b/tests/end_to_end/tpu/qwen/next/run_qwen3_next.md
@@ -37,8 +37,8 @@ After converting the checkpoint, you can use it for fine-tuning or start a pre-t

```
python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
-  base_output_directory=${BASE_OUTPUT_DIRECTORY} \
-  dataset_path=${DATASET_PATH} \
+  base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
+  dataset_path=${DATASET_PATH?} \
  load_parameters_path=gs://your-gcs-bucket/qwen3_next_maxtext_ckpt/0/items \
  run_name=qwen3_next_finetuning \
  per_device_batch_size=1 \

diff --git a/tools/data_generation/generate_distillation_data_vllm.py b/tools/data_generation/generate_distillation_data_vllm.py
index b4df313481..e4617d56c5 100644
--- a/tools/data_generation/generate_distillation_data_vllm.py
+++ b/tools/data_generation/generate_distillation_data_vllm.py
@@ -24,11 +24,11 @@
  --dataset-path HuggingFaceH4/ultrachat_200k \
  --data-split train_sft \
  --data-columns messages \
-  --hf-access-token $HF_TOKEN \
-  --teacher-model ${BASE_DIRECTORY}/qwen3-32b \
+  --hf-access-token ${HF_TOKEN?} \
+  --teacher-model ${BASE_DIRECTORY?}/qwen3-32b \
  --use-chat-template \
  --num-prompts 5120 \
-  --output-file ${BASE_DIRECTORY}/datasets/distillation_data.parquet
+  --output-file ${BASE_DIRECTORY?}/datasets/distillation_data.parquet

This processes 5120 prompts, generating the specified number of samples per prompt.
Some prompts may be filtered out if prompt tokens are longer than `max-prefill-length`.
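As a quick sanity check after generation, confirm the parquet file exists and has a plausible size. A sketch, assuming `BASE_DIRECTORY` is a local path (for a `gs://` prefix, use `gsutil ls -l` instead):

```bash
# The output file should exist and be non-trivially sized
ls -lh "${BASE_DIRECTORY?}/datasets/distillation_data.parquet"
```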