diff --git a/.github/workflows/UploadDockerImages.yml b/.github/workflows/UploadDockerImages.yml
index 9b967bc384..9433efdf3e 100644
--- a/.github/workflows/UploadDockerImages.yml
+++ b/.github/workflows/UploadDockerImages.yml
@@ -33,18 +33,37 @@ on:
           - tpu
           - gpu
 
+permissions:
+  contents: read
+
 jobs:
-  build:
-    name: Build ${{ matrix.device }}-${{ matrix.build_mode }} Image
-    runs-on: linux-x86-n2-16-buildkit
-    container: google/cloud-sdk:524.0.0
+  # Resolve the commit SHA and image date once so that every build job uses the same values.
+  setup:
+    runs-on: ubuntu-latest
+    outputs:
+      maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
+      image_date: ${{ steps.vars.outputs.image_date }}
+    steps:
+      - name: Checkout MaxText
+        uses: actions/checkout@v5
+
+      - name: Get metadata
+        id: vars
+        run: |
+          # MaxText SHA
+          echo "maxtext_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"
+
+          # Image date
+          echo "image_date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
 
-    # Use Github Actions matrix to run image builds in parallel
+  # TPU pre-training images (stable and nightly), built in parallel through the matrix.
+  tpu-pre-training:
+    name: ${{ matrix.image_name }}
+    needs: setup
     strategy:
       fail-fast: false
       matrix:
         include:
-          # TPU Image Builds
           - device: tpu
             build_mode: stable
             image_name: maxtext_jax_stable
@@ -53,7 +72,49 @@ jobs:
             build_mode: nightly
             image_name: maxtext_jax_nightly
             dockerfile: ./dependencies/dockerfiles/maxtext_dependencies.Dockerfile
-          # GPU Image Builds
+    uses: ./.github/workflows/build_and_push_docker_image.yml
+    with:
+      image_name: ${{ matrix.image_name }}
+      device: ${{ matrix.device }}
+      build_mode: ${{ matrix.build_mode }}
+      dockerfile: ${{ matrix.dockerfile }}
+      maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
+      image_date: ${{ needs.setup.outputs.image_date }}
+
+  # TPU post-training images; they wait for tpu-pre-training because they build on maxtext_jax_stable.
+  tpu-post-training:
+    name: ${{ matrix.image_name }}
+    needs: [setup, tpu-pre-training]
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - device: tpu
+            build_mode: post-training
+            image_name: maxtext_post_training_stable
+            dockerfile: ./dependencies/dockerfiles/maxtext_post_training_dependencies.Dockerfile
+          - device: tpu
+            build_mode: post-training
+            image_name: maxtext_post_training_nightly
+            dockerfile: ./dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile
+    uses: ./.github/workflows/build_and_push_docker_image.yml
+    with:
+      image_name: ${{ matrix.image_name }}
+      device: ${{ matrix.device }}
+      build_mode: ${{ matrix.build_mode }}
+      dockerfile: ${{ matrix.dockerfile }}
+      maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
+      image_date: ${{ needs.setup.outputs.image_date }}
+      is_post_training: true
+
+  # GPU pre-training images (stable and nightly), built in parallel through the matrix.
+  gpu-pre-training:
+    name: ${{ matrix.image_name }}
+    needs: setup
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
           - device: gpu
             build_mode: stable
             image_name: maxtext_gpu_jax_stable
@@ -62,74 +123,11 @@ jobs:
             build_mode: nightly
             image_name: maxtext_gpu_jax_nightly
             dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
-
-    if: >
-      github.event_name == 'schedule' ||
-      github.event_name == 'pull_request' ||
-      github.event_name == 'workflow_dispatch' && (
-        github.event.inputs.target_device == 'all' ||
-        github.event.inputs.target_device == 'tpu' ||
-        github.event.inputs.target_device == 'gpu'
-      )
-
-    # Setup for GKE runners per b/412986220#comment82 and b/412986220#comment90
-    steps:
-      - name: Check if build should run
-        id: check
-        shell: bash
-        run: |
-          if [[ "${{ github.event_name }}" == "workflow_dispatch" && "${{ github.event.inputs.target_device }}" != "all" && "${{ github.event.inputs.target_device }}" != "${{ matrix.device }}" ]]; then
-            echo "should_run=false" >> $GITHUB_OUTPUT
-            echo "Skipping build for device: ${{ matrix.device }} in ${{ matrix.build_mode }} mode."
-          else
-            echo "should_run=true" >> $GITHUB_OUTPUT
-            echo "Building for device: ${{ matrix.device }} in ${{ matrix.build_mode }} mode."
-          fi
-
-      - name: Checkout git repository
-        uses: actions/checkout@v5
-        if: steps.check.outputs.should_run == 'true'
-
-      - name: Mark git repository as safe
-        if: steps.check.outputs.should_run == 'true'
-        run: git config --global --add safe.directory ${GITHUB_WORKSPACE}
-
-      - name: Configure Docker
-        if: steps.check.outputs.should_run == 'true'
-        run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
-
-      - name: Set up Docker BuildX
-        uses: docker/setup-buildx-action@v3.11.1
-        if: steps.check.outputs.should_run == 'true'
-        with:
-          driver: remote
-          endpoint: tcp://localhost:1234
-
-      # Env variables to be passed to Dockerfile
-      - name: Get metadata
-        id: vars
-        if: steps.check.outputs.should_run == 'true'
-        run: |
-          echo "commit_hash=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
-          echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT
-
-      # Docker BuildX command config
-      - name: Build and Push Docker Image
-        uses: docker/build-push-action@v6
-        if: steps.check.outputs.should_run == 'true'
-        with:
-          push: true
-          context: .
-          file: ${{ matrix.dockerfile }}
-          tags: |
-            gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:maxtext_${{ steps.vars.outputs.commit_hash }}
-            gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:${{ steps.vars.outputs.image_date }}
-            gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:latest
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-          provenance: false
-          build-args: |
-            DEVICE=${{ matrix.device }}
-            MODE=${{ matrix.build_mode }}
-            JAX_VERSION=NONE
-            LIBTPU_GCS_PATH=NONE
+    uses: ./.github/workflows/build_and_push_docker_image.yml
+    with:
+      image_name: ${{ matrix.image_name }}
+      device: ${{ matrix.device }}
+      build_mode: ${{ matrix.build_mode }}
+      dockerfile: ${{ matrix.dockerfile }}
+      maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
+      image_date: ${{ needs.setup.outputs.image_date }}
diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
new file mode 100644
index 0000000000..6b852c38a1
--- /dev/null
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -0,0 +1,146 @@
+# Copyright 2025 Google LLC
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This workflow builds and pushes a MaxText Docker image to GCR.
+# It is a reusable workflow (workflow_call) that UploadDockerImages.yml calls once per image.
+
+name: Build and Push MaxText Docker Images
+
+on:
+  workflow_call:
+    inputs:
+      image_name:
+        required: true
+        type: string
+      device:
+        required: true
+        type: string
+      build_mode:
+        required: true
+        type: string
+      dockerfile:
+        required: true
+        type: string
+      maxtext_sha:
+        required: true
+        type: string
+      image_date:
+        required: true
+        type: string
+      is_post_training:
+        required: false
+        type: boolean
+        default: false
+
+permissions:
+  contents: read
+
+jobs:
+  build_and_push:
+    runs-on: linux-x86-n2-16-buildkit
+    container: google/cloud-sdk:524.0.0
+    # Build only when the run was triggered by a schedule, a pull request, or a manual dispatch.
+    if: >
+      github.event_name == 'schedule' ||
+      github.event_name == 'pull_request' ||
+      github.event_name == 'workflow_dispatch' && (
+        github.event.inputs.target_device == 'all' ||
+        github.event.inputs.target_device == 'tpu' ||
+        github.event.inputs.target_device == 'gpu'
+      )
+    steps:
+      # On manual dispatch, build only when target_device is 'all' or matches this image's device.
+      - name: Check if build should run
+        id: check
+        shell: bash
+        run: |
+          if [[ "${{ github.event_name }}" == "workflow_dispatch" && "${{ github.event.inputs.target_device }}" != "all" && "${{ github.event.inputs.target_device }}" != "${{ inputs.device }}" ]]; then
+            echo "should_run=false" >> "$GITHUB_OUTPUT"
+            echo "Skipping ${{ inputs.image_name }} build for device: ${{ inputs.device }} in ${{ inputs.build_mode }} mode."
+          else
+            echo "should_run=true" >> "$GITHUB_OUTPUT"
+            echo "Building ${{ inputs.image_name }} for device: ${{ inputs.device }} in ${{ inputs.build_mode }} mode."
+          fi
+
+      - name: Checkout MaxText
+        uses: actions/checkout@v5
+        if: steps.check.outputs.should_run == 'true'
+        with:
+          # This ensures that every job clones the exact same commit as "setup" job
+          ref: ${{ inputs.maxtext_sha }}
+
+      - name: Checkout post-training dependencies
+        if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
+        run: |
+          git clone https://github.com/google/tunix.git ./tunix
+          git clone https://github.com/vllm-project/vllm.git ./vllm
+          git clone https://github.com/vllm-project/tpu-inference.git ./tpu-inference
+
+      - name: Mark git repositories as safe
+        run: git config --global --add safe.directory '*'
+        if: steps.check.outputs.should_run == 'true'
+
+      - name: Configure Docker
+        run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
+        if: steps.check.outputs.should_run == 'true'
+
+      - name: Set up Docker BuildX
+        uses: docker/setup-buildx-action@v3.11.1
+        if: steps.check.outputs.should_run == 'true'
+        with:
+          driver: remote
+          endpoint: tcp://localhost:1234
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v6
+        if: steps.check.outputs.should_run == 'true'
+        with:
+          push: true
+          context: .
+          file: ${{ inputs.dockerfile }}
+          tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
+          cache-from: type=gha
+          outputs: type=image,compression=zstd,force-compression=true
+          # BASEIMAGE is passed only for post-training builds, which layer on the same-day maxtext_jax_stable.
+          # NOTE(review): assumes the pre-training and GPU Dockerfiles must keep their own base image - confirm.
+          build-args: |
+            DEVICE=${{ inputs.device }}
+            MODE=${{ inputs.build_mode }}
+            JAX_VERSION=NONE
+            LIBTPU_GCS_PATH=NONE
+            ${{ inputs.is_post_training && format('BASEIMAGE=gcr.io/tpu-prod-env-multipod/maxtext_jax_stable:{0}', inputs.image_date) || '' }}
+
+      # The build pushes only the :latest tag; every other tag is added to that pushed image here.
+      - name: Add tags to Docker image
+        if: steps.check.outputs.should_run == 'true'
+        shell: bash
+        run: |
+          SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}"
+
+          # Add date tag
+          gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${{ inputs.image_date }}" --quiet
+
+          # Add MaxText tag
+          maxtext_hash=$(git rev-parse --short HEAD)
+          gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:maxtext_${maxtext_hash}" --quiet
+
+          # Add post-training dependencies tags
+          if [ "${{ inputs.is_post_training }}" == "true" ]; then
+            for dir in tunix vllm tpu-inference; do
+              if [ -d "./$dir" ]; then
+                dir_hash=$(git -C "$dir" rev-parse --short HEAD)
+                gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dir}_$dir_hash" --quiet
+              fi
+            done
+          fi