Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 72 additions & 78 deletions .github/workflows/UploadDockerImages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,35 @@ on:
- tpu
- gpu

permissions:
contents: read

jobs:
build:
name: Build ${{ matrix.device }}-${{ matrix.build_mode }} Image
runs-on: linux-x86-n2-16-buildkit
container: google/cloud-sdk:524.0.0
# Resolves the commit SHA and the build date once, and exposes them as job
# outputs so the build jobs that list this job in `needs` all receive the
# same values.
setup:
  runs-on: ubuntu-latest
  outputs:
    maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
    image_date: ${{ steps.vars.outputs.image_date }}
  steps:
    - name: Checkout MaxText
      uses: actions/checkout@v5

    - name: Get metadata
      id: vars
      run: |
        # MaxText SHA: full hash of the commit that was just checked out.
        echo "maxtext_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT"

        # Image date: YYYY-MM-DD in the runner's local timezone.
        echo "image_date=$(date +%Y-%m-%d)" >> "$GITHUB_OUTPUT"

# Use Github Actions matrix to run image builds in parallel
tpu-pre-training:
Comment thread Fixed
name: ${{ matrix.image_name }}
needs: setup
strategy:
fail-fast: false
matrix:
include:
# TPU Image Builds
- device: tpu
build_mode: stable
image_name: maxtext_jax_stable
Expand All @@ -53,7 +70,47 @@ jobs:
build_mode: nightly
image_name: maxtext_jax_nightly
dockerfile: ./dependencies/dockerfiles/maxtext_dependencies.Dockerfile
# GPU Image Builds
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: ${{ matrix.image_name }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}

# Post-training TPU images. Each matrix entry is built by the reusable
# build_and_push_docker_image.yml workflow. The job waits for
# tpu-pre-training as well as setup.
# NOTE(review): presumably the wait exists because the post-training
# Dockerfiles build on the image pushed by tpu-pre-training - confirm.
tpu-post-training:
  needs:
    - setup
    - tpu-pre-training
  name: ${{ matrix.image_name }}
  strategy:
    # Let the other matrix entry keep building if one of them fails.
    fail-fast: false
    matrix:
      include:
        - image_name: maxtext_post_training_stable
          device: tpu
          build_mode: post-training
          dockerfile: ./dependencies/dockerfiles/maxtext_post_training_dependencies.Dockerfile
        - image_name: maxtext_post_training_nightly
          device: tpu
          build_mode: post-training
          dockerfile: ./dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile
  uses: ./.github/workflows/build_and_push_docker_image.yml
  with:
    # Per-image settings taken from the matrix entry.
    image_name: ${{ matrix.image_name }}
    device: ${{ matrix.device }}
    build_mode: ${{ matrix.build_mode }}
    dockerfile: ${{ matrix.dockerfile }}
    # Values resolved once by the setup job.
    maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
    image_date: ${{ needs.setup.outputs.image_date }}
    is_post_training: true

gpu-pre-training:
name: ${{ matrix.image_name }}
Comment thread Fixed
needs: setup
strategy:
fail-fast: false
matrix:
include:
- device: gpu
build_mode: stable
image_name: maxtext_gpu_jax_stable
Expand All @@ -62,74 +119,11 @@ jobs:
build_mode: nightly
image_name: maxtext_gpu_jax_nightly
dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile

if: >
github.event_name == 'schedule' ||
github.event_name == 'pull_request' ||
github.event_name == 'workflow_dispatch' && (
github.event.inputs.target_device == 'all' ||
github.event.inputs.target_device == 'tpu' ||
github.event.inputs.target_device == 'gpu'
)

# Setup for GKE runners per b/412986220#comment82 and b/412986220#comment90
steps:
- name: Check if build should run
id: check
shell: bash
run: |
if [[ "${{ github.event_name }}" == "workflow_dispatch" && "${{ github.event.inputs.target_device }}" != "all" && "${{ github.event.inputs.target_device }}" != "${{ matrix.device }}" ]]; then
echo "should_run=false" >> $GITHUB_OUTPUT
echo "Skipping build for device: ${{ matrix.device }} in ${{ matrix.build_mode }} mode."
else
echo "should_run=true" >> $GITHUB_OUTPUT
echo "Building for device: ${{ matrix.device }} in ${{ matrix.build_mode }} mode."
fi

- name: Checkout git repository
uses: actions/checkout@v5
if: steps.check.outputs.should_run == 'true'

- name: Mark git repository as safe
if: steps.check.outputs.should_run == 'true'
run: git config --global --add safe.directory ${GITHUB_WORKSPACE}

- name: Configure Docker
if: steps.check.outputs.should_run == 'true'
run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q

- name: Set up Docker BuildX
uses: docker/setup-buildx-action@v3.11.1
if: steps.check.outputs.should_run == 'true'
with:
driver: remote
endpoint: tcp://localhost:1234

# Env variables to be passed to Dockerfile
- name: Get metadata
id: vars
if: steps.check.outputs.should_run == 'true'
run: |
echo "commit_hash=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT

# Docker BuildX command config
- name: Build and Push Docker Image
uses: docker/build-push-action@v6
if: steps.check.outputs.should_run == 'true'
with:
push: true
context: .
file: ${{ matrix.dockerfile }}
tags: |
gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:maxtext_${{ steps.vars.outputs.commit_hash }}
gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:${{ steps.vars.outputs.image_date }}
gcr.io/tpu-prod-env-multipod/${{ matrix.image_name }}:latest
cache-from: type=gha
cache-to: type=gha,mode=max
provenance: false
build-args: |
DEVICE=${{ matrix.device }}
MODE=${{ matrix.build_mode }}
JAX_VERSION=NONE
LIBTPU_GCS_PATH=NONE
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: ${{ matrix.image_name }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}
140 changes: 140 additions & 0 deletions .github/workflows/build_and_push_docker_image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# Copyright 2025 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This reusable workflow builds a single MaxText Docker image and pushes it to GCR.

name: Build and Push MaxText Docker Images

# Reusable workflow: invoked from other workflows through `uses:`.
on:
  workflow_call:
    inputs:
      # Image name under gcr.io/tpu-prod-env-multipod/ that receives the tags.
      image_name:
        type: string
        required: true
      # Passed to the Dockerfile as the DEVICE build argument; also compared
      # with the manual-dispatch target_device to decide whether to build.
      device:
        type: string
        required: true
      # Passed to the Dockerfile as the MODE build argument.
      build_mode:
        type: string
        required: true
      # Path of the Dockerfile to build.
      dockerfile:
        type: string
        required: true
      # Git ref that is checked out before building.
      maxtext_sha:
        type: string
        required: true
      # Date string used for the date tag and for the BASEIMAGE build argument.
      image_date:
        type: string
        required: true
      # When true, tags for the tunix / vllm / tpu-inference checkouts are
      # added for each of those directories that exists.
      is_post_training:
        type: boolean
        required: false
        default: false

# Token scope for this workflow: read access to repository contents only.
permissions:
  contents: read

jobs:
  # Builds the image described by the workflow inputs, pushes it as `:latest`,
  # then attaches the date tag, the MaxText commit tag and (for post-training
  # images) one tag per cloned dependency repository.
  build_and_push:
    runs-on: linux-x86-n2-16-buildkit
    container: google/cloud-sdk:524.0.0
    # Runs for scheduled and pull_request events, and for manual dispatches
    # whose target_device is one of all / tpu / gpu. The parentheses around
    # the workflow_dispatch clause only make the existing grouping explicit.
    if: >
      github.event_name == 'schedule' ||
      github.event_name == 'pull_request' ||
      (github.event_name == 'workflow_dispatch' && (
      github.event.inputs.target_device == 'all' ||
      github.event.inputs.target_device == 'tpu' ||
      github.event.inputs.target_device == 'gpu'
      ))
    steps:
      # Per-device filter for manual dispatches: every later step is gated on
      # the should_run output written here.
      - name: Check if build should run
        id: check
        shell: bash
        # Expression values are handed to the script through the environment
        # instead of being spliced into the script text, so their content can
        # never be interpreted as shell syntax.
        env:
          EVENT_NAME: ${{ github.event_name }}
          TARGET_DEVICE: ${{ github.event.inputs.target_device }}
          DEVICE: ${{ inputs.device }}
          IMAGE_NAME: ${{ inputs.image_name }}
          BUILD_MODE: ${{ inputs.build_mode }}
        run: |
          if [[ "$EVENT_NAME" == "workflow_dispatch" && "$TARGET_DEVICE" != "all" && "$TARGET_DEVICE" != "$DEVICE" ]]; then
            echo "should_run=false" >> "$GITHUB_OUTPUT"
            echo "Skipping $IMAGE_NAME build for device: $DEVICE in $BUILD_MODE mode."
          else
            echo "should_run=true" >> "$GITHUB_OUTPUT"
            echo "Building $IMAGE_NAME for device: $DEVICE in $BUILD_MODE mode."
          fi

      - name: Checkout MaxText
        uses: actions/checkout@v5
        if: steps.check.outputs.should_run == 'true'
        with:
          # This ensures that every job clones the exact same commit as "setup" job
          ref: ${{ inputs.maxtext_sha }}

      # Only the nightly post-training image gets these source checkouts.
      - name: Checkout post-training dependencies
        if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
        run: |
          git clone https://github.com/google/tunix.git ./tunix
          git clone https://github.com/vllm-project/vllm.git ./vllm
          git clone https://github.com/vllm-project/tpu-inference.git ./tpu-inference

      # Applies to every repository path, so the later `git rev-parse` calls
      # work for the MaxText checkout and for the cloned dependencies.
      - name: Mark git repositories as safe
        run: git config --global --add safe.directory '*'
        if: steps.check.outputs.should_run == 'true'

      - name: Configure Docker
        run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
        if: steps.check.outputs.should_run == 'true'

      - name: Set up Docker BuildX
        uses: docker/setup-buildx-action@v3.11.1
        if: steps.check.outputs.should_run == 'true'
        with:
          driver: remote
          endpoint: tcp://localhost:1234

      - name: Build and push Docker image
        # The id exposes this step's `digest` output to the tagging step.
        id: build
        uses: docker/build-push-action@v6
        if: steps.check.outputs.should_run == 'true'
        with:
          push: true
          context: .
          file: ${{ inputs.dockerfile }}
          tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
          # NOTE(review): only cache-from is configured, so this job reads the
          # GitHub Actions cache but never writes to it - confirm intended.
          cache-from: type=gha
          outputs: type=image,compression=zstd,force-compression=true
          # NOTE(review): BASEIMAGE is passed to every build, whatever the
          # device; presumably only some Dockerfiles consume it - verify.
          build-args: |
            DEVICE=${{ inputs.device }}
            MODE=${{ inputs.build_mode }}
            JAX_VERSION=NONE
            LIBTPU_GCS_PATH=NONE
            BASEIMAGE=gcr.io/tpu-prod-env-multipod/maxtext_jax_stable:${{ inputs.image_date }}

      - name: Add tags to Docker image
        if: steps.check.outputs.should_run == 'true'
        shell: bash
        env:
          IMAGE_NAME: ${{ inputs.image_name }}
          IMAGE_DATE: ${{ inputs.image_date }}
          IS_POST_TRAINING: ${{ inputs.is_post_training }}
          IMAGE_DIGEST: ${{ steps.build.outputs.digest }}
        run: |
          SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/$IMAGE_NAME"

          # Tag the exact image this job pushed. `:latest` is mutable, so an
          # overlapping run could repoint it between the push and the tagging
          # below. Fall back to `:latest` only if no digest was reported.
          if [[ -n "$IMAGE_DIGEST" ]]; then
            SOURCE_REF="${SOURCE_IMAGE}@${IMAGE_DIGEST}"
          else
            SOURCE_REF="${SOURCE_IMAGE}:latest"
          fi

          # Add date tag
          gcloud container images add-tag "$SOURCE_REF" "${SOURCE_IMAGE}:${IMAGE_DATE}" --quiet

          # Add MaxText tag
          maxtext_hash=$(git rev-parse --short HEAD)
          gcloud container images add-tag "$SOURCE_REF" "${SOURCE_IMAGE}:maxtext_${maxtext_hash}" --quiet

          # Add post-training dependencies tags (only for directories that
          # were actually cloned in this job)
          if [ "$IS_POST_TRAINING" == "true" ]; then
            for dir in tunix vllm tpu-inference; do
              if [ -d "./$dir" ]; then
                dir_hash=$(git -C "$dir" rev-parse --short HEAD)
                gcloud container images add-tag "$SOURCE_REF" "${SOURCE_IMAGE}:${dir}_${dir_hash}" --quiet
              fi
            done
          fi
Comment thread Fixed
Loading