diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index 178db42a609a..b25f3b21e8eb 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -8,6 +8,8 @@ if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0" elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" +elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX" fi @@ -15,6 +17,8 @@ fi # Compress the fatbin with -compress-mode=size for CUDA 13 if [[ "$DESIRED_CUDA" == *"13"* ]]; then export TORCH_NVCC_FLAGS="-compress-mode=size" + # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801 + export BUILD_BUNDLE_PTXAS=1 fi SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" @@ -31,8 +35,7 @@ pip install -r /pytorch/requirements.txt pip install auditwheel==6.2.0 wheel if [ "$DESIRED_CUDA" = "cpu" ]; then echo "BASE_CUDA_VERSION is not set. Building cpu wheel." - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" export USE_SYSTEM_NCCL=1 @@ -46,6 +49,5 @@ else export USE_NVIDIA_PYPI_LIBS=1 fi - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index 1b6429fa8c06..a99e5f8f6565 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -13,49 +13,6 @@ def list_dir(path: str) -> list[str]: return check_output(["ls", "-1", path]).decode().split("\n") -def build_ArmComputeLibrary() -> None: - """ - Using ArmComputeLibrary for aarch64 PyTorch - """ - print("Building Arm Compute Library") - acl_build_flags = [ - "debug=0", - "neon=1", - "opencl=0", - "os=linux", - "openmp=1", - "cppthreads=0", - "arch=armv8a", - "multi_isa=1", - "fixed_format_kernels=1", - "build=native", - ] - acl_install_dir = "/acl" - acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary") - if os.path.isdir(acl_install_dir): - shutil.rmtree(acl_install_dir) - if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)): - check_call( - [ - "git", - "clone", - "https://github.com/ARM-software/ComputeLibrary.git", - "-b", - "v25.02", - "--depth", - "1", - "--shallow-submodules", - ] - ) - - check_call( - ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags, - cwd=acl_checkout_dir, - ) - for d in ["arm_compute", "include", "utils", "support", "src", "build"]: - shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}") - - def replace_tag(filename) -> None: with open(filename) as f: lines = f.readlines() @@ -317,7 +274,7 @@ def parse_arguments(): ).decode() print("Building PyTorch wheel") - build_vars = 
"CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "" # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) if enable_cuda: build_vars += "MAX_JOBS=5 " @@ -356,23 +313,17 @@ def parse_arguments(): build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " if enable_mkldnn: - build_ArmComputeLibrary() print("build pytorch with mkldnn+acl backend") - build_vars += ( - "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " - "ACL_ROOT_DIR=/acl " - "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH " - "ACL_INCLUDE_DIR=/acl/build " - "ACL_LIBRARY=/acl/build " - ) + build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " + build_vars += "ACL_ROOT_DIR=/acl " if enable_cuda: build_vars += "BLAS=NVPL " else: - build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS " + build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS " else: print("build pytorch without mkldnn backend") - os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") + os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation") if enable_cuda: print("Updating Cuda Dependency") filename = os.listdir("/pytorch/dist/") diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index 7a4715d33006..a157ec57b574 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): try: with socket.create_connection((addr, port), timeout=timeout): return - except (ConnectionRefusedError, socket.timeout): # noqa: PERF203 + except (ConnectionRefusedError, TimeoutError): # noqa: PERF203 if i == attempt_cnt - 1: raise time.sleep(timeout) @@ -299,40 +299,6 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None: ) -def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None: - print("Building OpenBLAS") - host.run_cmd( - f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}" - ) - make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8" - host.run_cmd( - f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS" - ) - - -def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None: - print("Building Arm Compute Library") - acl_build_flags = " ".join( - [ - "debug=0", - "neon=1", - "opencl=0", - "os=linux", - "openmp=1", - "cppthreads=0", - "arch=armv8a", - "multi_isa=1", - "fixed_format_kernels=1", - "build=native", - ] - ) - host.run_cmd( - f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}" - ) - - host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") - - def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None: host.run_cmd("pip3 install auditwheel") host.run_cmd( @@ -442,7 +408,7 @@ def build_torchvision( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - host.run_cmd(f"cd vision && {build_vars} python3 setup.py bdist_wheel") + host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation") vision_wheel_name = host.list_dir("vision/dist")[0] embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name)) @@ -497,7 +463,7 @@ def build_torchdata( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - host.run_cmd(f"cd data && {build_vars} python3 
setup.py bdist_wheel") + host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation") wheel_name = host.list_dir("data/dist")[0] embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name)) @@ -553,7 +519,7 @@ def build_torchtext( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - host.run_cmd(f"cd text && {build_vars} python3 setup.py bdist_wheel") + host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation") wheel_name = host.list_dir("text/dist")[0] embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name)) @@ -614,7 +580,7 @@ def build_torchaudio( host.run_cmd( f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \ && ./packaging/ffmpeg/build.sh \ - && {build_vars} python3 setup.py bdist_wheel" + && {build_vars} python3 -m build --wheel --no-isolation" ) wheel_name = host.list_dir("audio/dist")[0] @@ -700,7 +666,6 @@ def start_build( configure_system( host, compiler=compiler, use_conda=use_conda, python_version=python_version ) - build_OpenBLAS(host, git_clone_flags) if host.using_docker(): print("Move libgfortant.a into a standard location") @@ -723,10 +688,12 @@ def start_build( f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}" ) + host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh") + print("Building PyTorch wheel") build_opts = "" if pytorch_build_number is not None: - build_opts += f" --build-number {pytorch_build_number}" + build_opts += f" -C--build-option=--build-number={pytorch_build_number}" # Breakpad build fails on aarch64 build_vars = "USE_BREAKPAD=0 " if branch == "nightly": @@ -743,15 +710,18 @@ def start_build( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" if enable_mkldnn: - build_ArmComputeLibrary(host, git_clone_flags) + host.run_cmd("pytorch/.ci/docker/common/install_acl.sh") print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" + build_vars += " BLAS=OpenBLAS" + build_vars += " OpenBLAS_HOME=/opt/OpenBLAS" + build_vars += " ACL_ROOT_DIR=/acl" host.run_cmd( - f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}" + f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}" ) print("Repair the wheel") pytorch_wheel_name = host.list_dir("pytorch/dist")[0] - ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib" + ld_library_path = "/acl/build:$HOME/pytorch/build/lib" host.run_cmd( f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}" ) @@ -763,7 +733,7 @@ def start_build( else: print("build pytorch without mkldnn backend") host.run_cmd( - f"cd pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}" + f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}" ) print("Deleting build folder") @@ -907,7 +877,7 @@ def terminate_instances(instance_type: str) -> None: def parse_arguments(): from argparse import ArgumentParser - parser = ArgumentParser("Builid and test AARCH64 wheels using EC2") + parser = ArgumentParser("Build and test AARCH64 wheels using EC2") parser.add_argument("--key-name", type=str) parser.add_argument("--debug", action="store_true") parser.add_argument("--build-only", action="store_true") @@ -1004,7 +974,7 @@ def parse_arguments(): install_condaforge_python(host, 
args.python_version) sys.exit(0) - python_version = args.python_version if args.python_version is not None else "3.9" + python_version = args.python_version if args.python_version is not None else "3.10" if args.use_torch_from_pypi: configure_system(host, compiler=args.compiler, python_version=python_version) diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index 481d21b96cfe..ce7803cf9acd 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -69,7 +69,8 @@ RUN bash ./install_cuda.sh 13.0 ENV DESIRED_CUDA=13.0 FROM ${ROCM_IMAGE} as rocm -ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +ARG PYTORCH_ROCM_ARCH +ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ADD ./common/install_mkl.sh install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh ENV MKLROOT /opt/intel diff --git a/.ci/docker/almalinux/build.sh b/.ci/docker/almalinux/build.sh index ec15c13e439b..ad234ce1ffb9 100755 --- a/.ci/docker/almalinux/build.sh +++ b/.ci/docker/almalinux/build.sh @@ -36,6 +36,12 @@ case ${DOCKER_TAG_PREFIX} in ;; rocm*) BASE_TARGET=rocm + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + # add gfx950, gfx115x conditionally starting in ROCm 7.0 + if [[ "$ROCM_VERSION" == *"7.0"* ]]; then + PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" + fi + EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" ;; *) echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index be85fdcb542d..a23c85bc60a5 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -84,8 +84,8 @@ fi _UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152 _UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96 if [[ "$image" == *rocm* ]]; then - _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6 - _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d + _UCX_COMMIT=29831d319e6be55cb8c768ca61de335c934ca39e + _UCC_COMMIT=9f4b242cbbd8b1462cbc732eb29316cdfa124b77 fi tag=$(echo $image | awk -F':' '{print $2}') @@ -113,6 +113,7 @@ case "$tag" in UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes + INSTALL_MINGW=yes ;; pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11) CUDA_VERSION=13.0.0 @@ -175,28 +176,17 @@ case "$tag" in fi GCC_VERSION=11 VISION=yes - ROCM_VERSION=6.4 + ROCM_VERSION=7.0 NINJA_VERSION=1.9.0 TRITON=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} + PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100" if [[ $tag =~ "benchmarks" ]]; then INDUCTOR_BENCHMARKS=yes fi ;; - pytorch-linux-noble-rocm-alpha-py3) - ANACONDA_PYTHON_VERSION=3.12 - GCC_VERSION=11 - VISION=yes - ROCM_VERSION=7.0 - NINJA_VERSION=1.9.0 - TRITON=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950" - ;; pytorch-linux-jammy-xpu-n-1-py3) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 @@ -262,13 +252,10 @@ case "$tag" in TRITON_CPU=yes ;; pytorch-linux-jammy-linter) - # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. - # We will need to update mypy version eventually, but that's for another day. 
The task - # would be to upgrade mypy to 1.0.0 with Python 3.11 - PYTHON_VERSION=3.9 + PYTHON_VERSION=3.10 ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) - PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) + PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) @@ -358,7 +345,7 @@ docker build \ --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "KATEX=${KATEX:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ - --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \ + --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" \ --build-arg "IMAGE_NAME=${IMAGE_NAME}" \ --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ @@ -375,6 +362,7 @@ docker build \ --build-arg "OPENBLAS=${OPENBLAS:-}" \ --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ + --build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ @@ -455,12 +443,3 @@ elif [ "$HAS_TRITON" = "yes" ]; then echo "expecting triton to not be installed, but it is" exit 1 fi - -# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if -# they support 4.0.0 yet, so exclude them from this check. -CMAKE_VERSION=$(drun cmake --version) -if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then - echo "CMake version is not 4.0.0:" - drun cmake --version - exit 1 -fi diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 0e527f468229..f2e2d655a6cf 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1 @@ -56392aa978594cc155fa8af48cd949f5b5f1823a +deb42f2a8e48f5032b4a98ee781a15fa87a157cf diff --git a/.ci/docker/ci_commit_pins/huggingface-requirements.txt b/.ci/docker/ci_commit_pins/huggingface-requirements.txt index 66e5dbdfb1bb..f4f3830136eb 100644 --- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt +++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt @@ -1,2 +1,2 @@ -transformers==4.54.0 +transformers==4.56.0 soxr==0.5.0 diff --git a/.ci/docker/ci_commit_pins/nccl-cu12.txt b/.ci/docker/ci_commit_pins/nccl-cu12.txt index d099a6b91b76..77a73992346c 100644 --- a/.ci/docker/ci_commit_pins/nccl-cu12.txt +++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt @@ -1 +1 @@ -v2.27.5-1 +v2.27.5-1 \ No newline at end of file diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index e543da3aa161..10f1207e60e6 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -5ae38bdb0dc066c5823e34dc9797afb9de42c866 +7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd diff --git a/.ci/docker/common/install_acl.sh b/.ci/docker/common/install_acl.sh old mode 100644 new mode 100755 index bf41a03b2806..0b865e5bc6f8 --- a/.ci/docker/common/install_acl.sh +++ b/.ci/docker/common/install_acl.sh @@ -1,16 +1,27 @@ -set -euo pipefail +#!/bin/bash +# Script used only in CD pipeline -readonly version=v25.02 -readonly src_host=https://github.com/ARM-software -readonly src_repo=ComputeLibrary +set -eux -# Clone ACL -[[ ! 
-d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git -cd ${src_repo} +ACL_VERSION=${ACL_VERSION:-"v25.02"} +ACL_INSTALL_DIR="/acl" -git checkout $version +# Clone ACL +git clone https://github.com/ARM-software/ComputeLibrary.git -b "${ACL_VERSION}" --depth 1 --shallow-submodules +ACL_CHECKOUT_DIR="ComputeLibrary" # Build with scons +pushd $ACL_CHECKOUT_DIR scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \ os=linux arch=armv8a build=native multi_isa=1 \ fixed_format_kernels=1 openmp=1 cppthreads=0 +popd + +# Install ACL +sudo mkdir -p ${ACL_INSTALL_DIR} +for d in arm_compute include utils support src build +do + sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d} +done + +rm -rf $ACL_CHECKOUT_DIR \ No newline at end of file diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index 692edd0b898f..c873c930097b 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -83,10 +83,6 @@ function build_cpython { py_suffix=${py_ver::-1} py_folder=$py_suffix fi - # Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4 - if [ "$py_suffix" == "3.14.0" ]; then - py_suffix="3.14.0rc2" - fi wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz do_cpython_build $py_ver Python-$py_suffix diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index becd2264e395..fb168acd4feb 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -42,22 +42,27 @@ install_pip_dependencies() { # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current # numba and scipy version used in PyTorch CI conda_run pip uninstall -y numba scipy + # Yaspin is needed for running CI test (get_benchmark_analysis_data.py) + pip_install yaspin==3.1.0 popd } setup_executorch() { - pushd executorch - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON" as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true - popd } -clone_executorch -install_buck2 -install_conda_dependencies -install_pip_dependencies -setup_executorch +if [ $# -eq 0 ]; then + clone_executorch + install_buck2 + install_conda_dependencies + install_pip_dependencies + pushd executorch + setup_executorch + popd +else + "$@" +fi diff --git a/.ci/docker/common/install_mingw.sh b/.ci/docker/common/install_mingw.sh new file mode 100644 index 000000000000..6232a0d0245c --- /dev/null +++ b/.ci/docker/common/install_mingw.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -ex + +# Install MinGW-w64 for Windows cross-compilation +apt-get update +apt-get install -y g++-mingw-w64-x86-64-posix + +echo "MinGW-w64 installed successfully" +x86_64-w64-mingw32-g++ --version diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index 9f23feb5adfa..b0615b8a84c1 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -19,8 +19,8 @@ pip_install \ transformers==4.36.2 pip_install coloredlogs packaging -pip_install onnxruntime==1.22.1 -pip_install onnxscript==0.4.0 +pip_install onnxruntime==1.23.0 +pip_install onnxscript==0.5.4 # Cache the transformers model to be used later by ONNX tests. 
We need to run the transformers # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh old mode 100644 new mode 100755 index 3c795acf2220..2f386c6bd523 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -3,8 +3,10 @@ set -ex -cd / -git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules +OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"} + +# Clone OpenBLAS +git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules OPENBLAS_CHECKOUT_DIR="OpenBLAS" OPENBLAS_BUILD_FLAGS=" @@ -17,5 +19,7 @@ CFLAGS=-O3 BUILD_BFLOAT16=1 " -make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR} -make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR} +make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR +sudo make install -C $OPENBLAS_CHECKOUT_DIR + +rm -rf $OPENBLAS_CHECKOUT_DIR \ No newline at end of file diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index a156670cb815..7878311c15b0 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -42,12 +42,6 @@ EOF rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}" amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu" - # Special case for ROCM_VERSION == 7.0 - if [[ $(ver "$ROCM_VERSION") -eq $(ver 7.0) ]]; then - rocm_baseurl="https://repo.radeon.com/rocm/apt/7.0_alpha2" - amdgpu_baseurl="https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu" - fi - # Add amdgpu repository UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index a8d8ba00b35b..9bf45e6f1b0a 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -12,8 +12,8 @@ function do_install() { rocm_version_nodot=${rocm_version//./} - # Version 2.7.2 + ROCm related updates - MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 + # https://github.com/icl-utk-edu/magma/pull/65 + MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" rocm_dir="/opt/rocm" diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index f48140952c3a..1b68e3c24783 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -66,15 +66,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" # Triton needs at least gcc-9 to build apt-get install -y g++-9 - CXX=g++-9 conda_run python setup.py bdist_wheel + CXX=g++-9 conda_run python -m build --wheel --no-isolation elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then # Triton needs which surprisingly is not available with clang-9 toolchain add-apt-repository -y ppa:ubuntu-toolchain-r/test apt-get install -y g++-9 - CXX=g++-9 conda_run python setup.py bdist_wheel + CXX=g++-9 conda_run python -m build --wheel --no-isolation else - conda_run python setup.py bdist_wheel + conda_run python -m build --wheel --no-isolation fi # Copy the wheel to /opt for multi stage docker builds diff --git a/.ci/docker/common/patch_libstdc.sh 
b/.ci/docker/common/patch_libstdc.sh new file mode 100755 index 000000000000..7e3a00d0dad8 --- /dev/null +++ b/.ci/docker/common/patch_libstdc.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -xe +# Script used in Linux x86 and aarch64 CD pipeline + +# Workaround for exposing statically linked libstdc++ CXX11 ABI symbols. +# see: https://github.com/pytorch/pytorch/issues/133437 +LIBNONSHARED=$(gcc -print-file-name=libstdc++_nonshared.a) +nm -g $LIBNONSHARED | grep " T " | grep recursive_directory_iterator | cut -c 20- > weaken-symbols.txt +objcopy --weaken-symbols weaken-symbols.txt $LIBNONSHARED $LIBNONSHARED diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index 7caedf1f44d4..c40896cb5499 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -39,13 +39,21 @@ case ${DOCKER_TAG_PREFIX} in DOCKER_GPU_BUILD_ARG="" ;; rocm*) - # we want the patch version of 6.4 instead - if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then + # we want the patch version of 7.0 instead + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" fi + # we want the patch version of 6.4 instead + if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then + GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4" + fi BASE_TARGET=rocm GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + # add gfx950, gfx115x conditionally starting in ROCm 7.0 + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then + PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" + fi DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" ;; *) diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index 5d4d8dba690d..4803cb778c90 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -130,7 +130,8 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \ /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \ done; - +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh # cmake-3.18.4 from pip; force in case cmake3 already exists RUN yum install -y python3-pip && \ diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index da7ab4d3fd15..768db0992936 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -62,6 +62,13 @@ ARG OPENBLAS_VERSION ADD ./common/install_openblas.sh install_openblas.sh RUN bash ./install_openblas.sh && rm install_openblas.sh +# Install Arm Compute Library +FROM base as arm_compute +# use python3.9 to install scons +RUN python3.9 -m pip install scons==4.7.0 +RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin +COPY ./common/install_acl.sh install_acl.sh +RUN bash ./install_acl.sh && rm install_acl.sh FROM base as final # remove unnecessary python versions @@ -70,4 +77,7 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/ -ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH +COPY --from=arm_compute /acl /acl +ENV 
LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 index 369706055737..347a01ee4ede 100644 --- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 @@ -86,6 +86,15 @@ FROM base as nvpl ADD ./common/install_nvpl.sh install_nvpl.sh RUN bash ./install_nvpl.sh && rm install_nvpl.sh +# Install Arm Compute Library +FROM base as arm_compute +# use python3.9 to install scons +RUN python3.9 -m pip install scons==4.7.0 +RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin +COPY ./common/install_acl.sh install_acl.sh +RUN bash ./install_acl.sh && rm install_acl.sh +FROM base as final + FROM final as cuda_final ARG BASE_CUDA_VERSION RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} @@ -93,5 +102,9 @@ COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BAS COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} COPY --from=nvpl /opt/nvpl/lib/ /usr/local/lib/ COPY --from=nvpl /opt/nvpl/include/ /usr/local/include/ +COPY --from=arm_compute /acl /acl RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda ENV PATH=/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/acl/build/:$LD_LIBRARY_PATH +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh diff --git a/.ci/docker/manywheel/Dockerfile_cxx11-abi b/.ci/docker/manywheel/Dockerfile_cxx11-abi deleted file mode 100644 index ed33cc61df09..000000000000 --- a/.ci/docker/manywheel/Dockerfile_cxx11-abi +++ /dev/null @@ -1,71 +0,0 @@ -FROM centos:8 as base - -ENV LC_ALL en_US.UTF-8 -ENV LANG en_US.UTF-8 -ENV LANGUAGE en_US.UTF-8 -ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - -# change to a valid repo -RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo -# enable to install ninja-build -RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo - -RUN yum -y update -RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo -RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++ - - -FROM base as openssl -ADD ./common/install_openssl.sh install_openssl.sh -RUN bash ./install_openssl.sh && rm install_openssl.sh - -# Install python -FROM base as python -RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel -ADD common/install_cpython.sh install_cpython.sh -RUN bash ./install_cpython.sh && rm install_cpython.sh - -FROM base as conda -ADD ./common/install_conda_docker.sh install_conda.sh -RUN bash ./install_conda.sh && rm install_conda.sh -RUN /opt/conda/bin/conda install -y cmake - -FROM base as intel -# Install MKL -COPY --from=python /opt/python /opt/python -COPY --from=python /opt/_internal /opt/_internal -COPY --from=conda /opt/conda /opt/conda -ENV PATH=/opt/conda/bin:$PATH -ADD ./common/install_mkl.sh install_mkl.sh -RUN bash ./install_mkl.sh && rm install_mkl.sh - -FROM base as patchelf -ADD ./common/install_patchelf.sh install_patchelf.sh -RUN bash ./install_patchelf.sh && rm install_patchelf.sh -RUN cp $(which patchelf) /patchelf - -FROM base as jni -ADD ./common/install_jni.sh install_jni.sh -ADD 
./java/jni.h jni.h -RUN bash ./install_jni.sh && rm install_jni.sh - -FROM base as libpng -ADD ./common/install_libpng.sh install_libpng.sh -RUN bash ./install_libpng.sh && rm install_libpng.sh - -FROM base as final -COPY --from=openssl /opt/openssl /opt/openssl -COPY --from=python /opt/python /opt/python -COPY --from=python /opt/_internal /opt/_internal -COPY --from=intel /opt/intel /opt/intel -COPY --from=conda /opt/conda /opt/conda -COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf -COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h -COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ -COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ -COPY --from=libpng /usr/local/include/png* /usr/local/include/ -COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ -COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ -COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig - -RUN yum install -y ninja-build diff --git a/.ci/docker/manywheel/Dockerfile_s390x b/.ci/docker/manywheel/Dockerfile_s390x index 46ec7f77ae8b..1cf83acb1c73 100644 --- a/.ci/docker/manywheel/Dockerfile_s390x +++ b/.ci/docker/manywheel/Dockerfile_s390x @@ -115,6 +115,9 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio # cmake-3.28.0 from pip for onnxruntime RUN python3 -mpip install cmake==3.28.0 +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh + # build onnxruntime 1.21.0 from sources. # it is not possible to build it from sources using pip, # so just build it from upstream repository. diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index 5dee4325857f..b4b505997303 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -28,6 +28,7 @@ fi MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} OPENBLAS_VERSION=${OPENBLAS_VERSION:-} +ACL_VERSION=${ACL_VERSION:-} case ${image} in manylinux2_28-builder:cpu) @@ -41,13 +42,6 @@ case ${image} in GPU_IMAGE=arm64v8/almalinux:8 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1" MANY_LINUX_VERSION="2_28_aarch64" - OPENBLAS_VERSION="v0.3.30" - ;; - manylinuxcxx11-abi-builder:cpu-cxx11-abi) - TARGET=final - GPU_IMAGE="" - DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" - MANY_LINUX_VERSION="cxx11-abi" ;; manylinuxs390x-builder:cpu-s390x) TARGET=final @@ -81,15 +75,23 @@ case ${image} in DOCKERFILE_SUFFIX="_cuda_aarch64" ;; manylinux2_28-builder:rocm*) - # we want the patch version of 6.4 instead - if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then + # we want the patch version of 7.0 instead + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" fi + # we want the patch version of 6.4 instead + if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then + GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4" + fi TARGET=rocm_final MANY_LINUX_VERSION="2_28" DEVTOOLSET_VERSION="11" GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + # add gfx950, gfx115x conditionally starting in ROCm 7.0 + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then + PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" + fi DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" ;; manylinux2_28-builder:xpu) 
@@ -121,7 +123,8 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') DOCKER_BUILDKIT=1 docker build \ ${DOCKER_GPU_BUILD_ARG} \ --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ - --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \ + --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \ + --build-arg "ACL_VERSION=${ACL_VERSION:-}" \ --target "${TARGET}" \ -t "${tmp_tag}" \ $@ \ diff --git a/.ci/docker/manywheel/build_scripts/ssl-check.py b/.ci/docker/manywheel/build_scripts/ssl-check.py index 0fd7eb363144..c4df0eacbb7f 100644 --- a/.ci/docker/manywheel/build_scripts/ssl-check.py +++ b/.ci/docker/manywheel/build_scripts/ssl-check.py @@ -10,11 +10,6 @@ print("Testing SSL certificate checking for Python:", sys.version) -if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4): - print("This version never checks SSL certs; skipping tests") - sys.exit(0) - - EXC = OSError print(f"Connecting to {GOOD_SSL} should work") diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 4e08c0d6711e..04dc2b98eb66 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -10,6 +10,11 @@ boto3==1.35.42 #Pinned versions: 1.19.12, 1.16.34 #test that import: +build==1.3.0 +#Description: A simple, correct Python build frontend. +#Pinned versions: 1.3.0 +#test that import: + click #Description: Command Line Interface Creation Kit #Pinned versions: @@ -47,10 +52,10 @@ flatbuffers==24.12.23 #Pinned versions: 24.12.23 #test that import: -hypothesis==5.35.1 +hypothesis==6.56.4 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 #Description: advanced library for generating parametrized tests -#Pinned versions: 5.35.1 +#Pinned versions: 6.56.4 #test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py junitparser==2.1.1 @@ -93,8 +98,9 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Pinned versions: #test that import: -mypy==1.16.0 +mypy==1.16.0 ; platform_system == "Linux" # Pin MyPy version because new errors are likely to appear with each release +# Skip on Windows as lots of type annotations are POSIX specific #Description: linter #Pinned versions: 1.16.0 #test that import: test_typing.py, test_type_hints.py @@ -105,20 +111,17 @@ networkx==2.8.8 #Pinned versions: 2.8.8 #test that import: functorch -ninja==1.11.1.3 +ninja==1.11.1.4 #Description: build system. Used in some tests. 
Used in build to generate build #time tracing information -#Pinned versions: 1.11.1.3 +#Pinned versions: 1.11.1.4 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py -numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x" -numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x" numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x" numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" #Description: Just-In-Time Compiler for Numerical Functions -#Pinned versions: 0.54.1, 0.49.0, <=0.49.1 +#Pinned versions: 0.55.2, 0.60.0 #test that import: test_numba_integration.py -#For numba issue see https://github.com/pytorch/pytorch/issues/51511 #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073 #numpy @@ -133,7 +136,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, #test_binary_ufuncs.py -numpy==1.22.4; python_version == "3.9" or python_version == "3.10" +numpy==1.22.4; python_version == "3.10" numpy==1.26.2; python_version == "3.11" or python_version == "3.12" numpy==2.1.2; python_version >= "3.13" @@ -165,12 +168,12 @@ optree==0.13.0 pillow==11.0.0 #Description: Python Imaging Library fork -#Pinned versions: 10.3.0 +#Pinned versions: 11.0.0 #test that import: -protobuf==5.29.4 +protobuf==5.29.5 #Description: Google's data interchange format -#Pinned versions: 5.29.4 +#Pinned versions: 5.29.5 #test that import: test_tensorboard.py, test/onnx/* psutil @@ -213,7 +216,7 @@ pytest-subtests==0.13.1 #Pinned versions: #test that import: -xdoctest==1.1.0 +xdoctest==1.3.0 #Description: runs doctests in pytest #Pinned versions: 1.1.0 #test that import: @@ -238,10 +241,9 @@ pygments==2.15.0 #Pinned versions: 14.1.0 #test that import: -scikit-image==0.19.3 ; python_version < "3.10" -scikit-image==0.22.0 ; python_version >= "3.10" +scikit-image==0.22.0 #Description: image processing routines -#Pinned versions: +#Pinned versions: 0.22.0 #test that import: test_nn.py #scikit-learn @@ -264,7 +266,7 @@ scipy==1.14.1 ; python_version >= "3.12" #test that import: # needed by torchgen utils -typing-extensions>=4.10.0 +typing-extensions==4.12.2 #Description: type hints for python #Pinned versions: #test that import: @@ -325,8 +327,6 @@ pywavelets==1.7.0 ; python_version >= "3.12" lxml==5.3.0 #Description: This is a requirement of unittest-xml-reporting -# Python-3.9 binaries - PyGithub==2.3.0 sympy==1.13.3 @@ -339,7 +339,7 @@ onnx==1.18.0 #Pinned versions: #test that import: -onnxscript==0.4.0 +onnxscript==0.5.3 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -359,9 +359,10 @@ pwlf==2.2.1 #test that import: test_sac_estimator.py # To build PyTorch itself -pyyaml +pyyaml==6.0.2 pyzstd -setuptools>=70.1.0 +setuptools==78.1.1 +packaging==23.1 six scons==4.5.2 ; platform_machine == "aarch64" @@ -376,13 +377,16 @@ dataclasses_json==0.6.7 #Pinned versions: 0.6.7 #test that import: -cmake==4.0.0 +cmake==3.31.6 #Description: required for building tlparse==0.4.0 #Description: required for log parsing -cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" +filelock==3.18.0 +#Description: required for inductor testing + +cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" and platform_system != "Darwin" #Description: required for testing 
CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits. #test that import: test_cuda.py diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index efe6fb4c949b..6e623b4c5694 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,8 +1,15 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2 +standard-imghdr==3.13.0; python_version >= "3.13" +#Description: This is needed by Sphinx, so it needs to be added here. +# The reasons are as follows: +# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr); +# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13. +# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency. + +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably # something related to Docker setup. We can investigate this later. 
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 1edc8c60c2f0..3f22a1276921 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -103,6 +103,11 @@ COPY ci_commit_pins/torchbench.txt torchbench.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt +ARG INSTALL_MINGW +COPY ./common/install_mingw.sh install_mingw.sh +RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi +RUN rm install_mingw.sh + ARG TRITON ARG TRITON_CPU diff --git a/.ci/lumen_cli/cli/lib/common/git_helper.py b/.ci/lumen_cli/cli/lib/common/git_helper.py index 9833caca956c..c4d6f8a0b6f5 100644 --- a/.ci/lumen_cli/cli/lib/common/git_helper.py +++ b/.ci/lumen_cli/cli/lib/common/git_helper.py @@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules logger.info("Successfully cloned %s", target) return r, commit - except GitCommandError as e: - logger.error("Git operation failed: %s", e) + except GitCommandError: + logger.exception("Git operation failed") raise diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py index 98cfc807e284..8c106214ea9e 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py @@ -41,7 +41,6 @@ def sample_vllm_test_library(): "pytest -v -s basic_correctness/test_cumem.py", "pytest -v -s basic_correctness/test_basic_correctness.py", "pytest -v -s basic_correctness/test_cpu_offload.py", - "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", ], }, "vllm_basic_models_test": { @@ -68,15 +67,12 @@ def sample_vllm_test_library(): "-v", "-s", "entrypoints/llm", - "--ignore=entrypoints/llm/test_lazy_outlines.py", "--ignore=entrypoints/llm/test_generate.py", - "--ignore=entrypoints/llm/test_generate_multiple_loras.py", "--ignore=entrypoints/llm/test_collective_rpc.py", ] ), - "pytest -v -s entrypoints/llm/test_lazy_outlines.py", - "pytest -v -s entrypoints/llm/test_generate.py ", - "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", + "pytest -v -s entrypoints/llm/test_generate.py", + "pytest -v -s entrypoints/offline_mode", ], }, "vllm_regression_test": { @@ -147,7 +143,7 @@ def sample_vllm_test_library(): "pytest -v -s compile/test_decorator.py", ], }, - "vllm_languagde_model_test_extended_generation_28_failure_test": { + "vllm_language_model_test_extended_generation_28_failure_test": { "title": "Language Models Test (Extended Generation) 2.8 release failure", "id": "vllm_languagde_model_test_extended_generation_28_failure_test", "package_install": [ diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py index 415e05d07551..63e5f7a28de5 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py @@ -63,7 +63,7 @@ class VllmBuildParameters: # DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True" use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True) dockerfile_path: Path = env_path_field( - "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm" + "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile" ) # the cleaning script to remove torch dependencies from pip diff --git a/.ci/magma-rocm/Makefile b/.ci/magma-rocm/Makefile index 5f63da87bc4d..9fca7ad54461 100644 --- 
a/.ci/magma-rocm/Makefile +++ b/.ci/magma-rocm/Makefile @@ -1,11 +1,11 @@ SHELL=/usr/bin/env bash DOCKER_CMD ?= docker -DESIRED_ROCM ?= 6.4 +DESIRED_ROCM ?= 7.0 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) PACKAGE_NAME = magma-rocm # inherit this from underlying docker image, do not pass this env var to docker -#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201 +#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201 DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ -v $(shell git rev-parse --show-toplevel)/.ci:/builder \ @@ -16,20 +16,20 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ magma-rocm/build_magma.sh .PHONY: all +all: magma-rocm70 all: magma-rocm64 -all: magma-rocm63 .PHONY: clean: $(RM) -r magma-* $(RM) -r output +.PHONY: magma-rocm70 +magma-rocm70: DESIRED_ROCM := 7.0 +magma-rocm70: + $(DOCKER_RUN) + .PHONY: magma-rocm64 magma-rocm64: DESIRED_ROCM := 6.4 magma-rocm64: $(DOCKER_RUN) - -.PHONY: magma-rocm63 -magma-rocm63: DESIRED_ROCM := 6.3 -magma-rocm63: - $(DOCKER_RUN) diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh index 4acb3fb0dc3b..c7c7780227ea 100755 --- a/.ci/magma-rocm/build_magma.sh +++ b/.ci/magma-rocm/build_magma.sh @@ -6,8 +6,8 @@ set -eou pipefail # The script expects DESIRED_CUDA and PACKAGE_NAME to be set ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -# Version 2.7.2 + ROCm related updates -MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 +# https://github.com/icl-utk-edu/magma/pull/65 +MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec # Folders for the build PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata @@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE # Fetch magma sources and verify checksum pushd ${PACKAGE_DIR} -git clone https://bitbucket.org/icl/magma.git +git clone https://github.com/jeffdaily/magma pushd magma git checkout ${MAGMA_VERSION} popd diff --git a/.ci/manywheel/build_common.sh b/.ci/manywheel/build_common.sh index 4c268befb30e..b84268fd1289 100644 --- a/.ci/manywheel/build_common.sh +++ b/.ci/manywheel/build_common.sh @@ -142,7 +142,7 @@ time CMAKE_ARGS=${CMAKE_ARGS[@]} \ EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ - python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR + python -m build --wheel --no-isolation --outdir /tmp/$WHEELHOUSE_DIR echo "Finished setup.py bdist at $(date)" # Build libtorch packages diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 6ed38f8b25c6..2a822295e036 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -187,19 +187,22 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then export USE_CUFILE=0 else DEPS_LIST+=( - "/usr/local/cuda/lib64/libnvToolsExt.so.1" "/usr/local/cuda/lib64/libcublas.so.12" "/usr/local/cuda/lib64/libcublasLt.so.12" "/usr/local/cuda/lib64/libcudart.so.12" "/usr/local/cuda/lib64/libnvrtc.so.12" "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12") DEPS_SONAME+=( - "libnvToolsExt.so.1" "libcublas.so.12" "libcublasLt.so.12" "libcudart.so.12" "libnvrtc.so.12" "libcupti.so.12") + + if [[ $CUDA_VERSION != 12.9* ]]; then + DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1") + DEPS_SONAME+=("libnvToolsExt.so.1") + fi fi else echo "Using nvidia 
libs from pypi." diff --git a/.ci/manywheel/build_libtorch.sh b/.ci/manywheel/build_libtorch.sh index 4de775b1823c..d78fbd5c3ed3 100644 --- a/.ci/manywheel/build_libtorch.sh +++ b/.ci/manywheel/build_libtorch.sh @@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr fi -echo "Calling 'python -m pip install .' at $(date)" +echo "Calling 'python -m pip install . -v --no-build-isolation' at $(date)" if [[ $LIBTORCH_VARIANT = *"static"* ]]; then STATIC_CMAKE_FLAG="-DTORCH_STATIC=1" diff --git a/.ci/manywheel/build_rocm.sh b/.ci/manywheel/build_rocm.sh index ffc15bcdc5fa..bac56746f450 100755 --- a/.ci/manywheel/build_rocm.sh +++ b/.ci/manywheel/build_rocm.sh @@ -107,6 +107,10 @@ if [[ $ROCM_INT -ge 60200 ]]; then ROCM_SO_FILES+=("librocm-core.so") fi +if [[ $ROCM_INT -ge 70000 ]]; then + ROCM_SO_FILES+=("librocroller.so") +fi + OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` if [[ "$OS_NAME" == *"CentOS Linux"* || "$OS_NAME" == *"AlmaLinux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index 1c88554c2af9..cae81a2568d5 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -89,7 +89,7 @@ fi if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then export USE_MKLDNN=1 export USE_MKLDNN_ACL=1 - export ACL_ROOT_DIR=/ComputeLibrary + export ACL_ROOT_DIR=/acl fi if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then @@ -233,7 +233,9 @@ if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then export BUILD_STATIC_RUNTIME_BENCHMARK=ON fi -if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then + export CMAKE_BUILD_TYPE=Debug +elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then export CMAKE_BUILD_TYPE=RelWithAssert fi @@ -290,15 +292,20 @@ else WERROR=1 python setup.py clean - WERROR=1 python setup.py bdist_wheel + WERROR=1 python -m build --wheel --no-isolation else python setup.py clean if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then source .ci/pytorch/install_cache_xla.sh fi - python setup.py bdist_wheel + python -m build --wheel --no-isolation fi pip_install_whl "$(echo dist/*.whl)" + if [[ "$BUILD_ENVIRONMENT" == *full-debug* ]]; then + # Regression test for https://github.com/pytorch/pytorch/issues/164297 + # Torch should be importable and that's about it + pushd /; python -c "import torch;print(torch.__config__.show(), torch.randn(5) + 1.7)"; popd + fi if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then install_torchvision diff --git a/.ci/pytorch/cpp_doc_push_script.sh b/.ci/pytorch/cpp_doc_push_script.sh index 6e417bf8bbe9..f085fa78bebe 100755 --- a/.ci/pytorch/cpp_doc_push_script.sh +++ b/.ci/pytorch/cpp_doc_push_script.sh @@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \ # Build the docs pushd docs/cpp -time make VERBOSE=1 html -j +time make VERBOSE=1 html popd popd diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index d41c3c08e628..c01efda11ea6 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -35,11 +35,12 @@ fi print_cmake_info if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then - USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel + # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls + USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python -m build --wheel --no-isolation else - # NB: we always build with distributed; USE_DISTRIBUTED turns off all - # backends (specifically the gloo backend), so test that this case works too - USE_DISTRIBUTED=0 
USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64 + # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests + # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448 + USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python -m build --wheel --no-isolation -C--build-option=--plat-name=macosx_11_0_arm64 fi if which sccache > /dev/null; then print_sccache_stats diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 79d47da43171..2687852a2c4f 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available( fi popd -python -mpip install -r requirements.txt - # enable debug asserts in serialization export TORCH_SERIALIZATION_DEBUG=1 -python -mpip install --no-input -r requirements.txt - setup_test_python() { # The CircleCI worker hostname doesn't resolve to an address. # This environment variable makes ProcessGroupGloo default to @@ -59,7 +55,7 @@ test_python_shard() { setup_test_python - time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" + time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS" assert_git_not_dirty } @@ -260,7 +256,7 @@ test_torchbench_smoketest() { local device=mps local dtypes=(undefined float16 bfloat16 notset) local dtype=${dtypes[$1]} - local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16) for backend in eager inductor; do @@ -323,7 +319,7 @@ test_aoti_torchbench_smoketest() { local device=mps local dtypes=(undefined float16 bfloat16 notset) local dtype=${dtypes[$1]} - local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16) echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}" local dtype_arg="--${dtype}" diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh index 219463f318db..039459816724 100755 --- a/.ci/pytorch/multigpu-test.sh +++ b/.ci/pytorch/multigpu-test.sh @@ -26,6 +26,7 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering + time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering time python test/run_test.py --verbose -i distributed/test_store time python test/run_test.py --verbose 
-i distributed/test_symmetric_memory time python test/run_test.py --verbose -i distributed/test_pg_wrapper diff --git a/.ci/pytorch/smoke_test/check_binary_symbols.py b/.ci/pytorch/smoke_test/check_binary_symbols.py index 3e88ffe4ffd7..b0c607659c72 100755 --- a/.ci/pytorch/smoke_test/check_binary_symbols.py +++ b/.ci/pytorch/smoke_test/check_binary_symbols.py @@ -32,6 +32,9 @@ "torch::", ) +# Patterns for detecting statically linked libstdc++ symbols +STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")] + def _apply_libtorch_symbols(symbols): return [ @@ -53,12 +56,17 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]: return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]] -def grep_symbols(lib: str, patterns: list[Any]) -> list[str]: +def grep_symbols( + lib: str, patterns: list[Any], symbol_type: str | None = None +) -> list[str]: def _grep_symbols( symbols: list[tuple[str, str, str]], patterns: list[Any] ) -> list[str]: rc = [] for _s_addr, _s_type, s_name in symbols: + # Filter by symbol type if specified + if symbol_type and _s_type != symbol_type: + continue for pattern in patterns: if pattern.match(s_name): rc.append(s_name) @@ -80,6 +88,18 @@ def _get_symbols_chunk(i): return functools.reduce(list.__add__, (x.result() for x in tasks), []) +def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None: + cxx11_statically_linked_symbols = grep_symbols( + lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T" + ) + num_statically_linked_symbols = len(cxx11_statically_linked_symbols) + print(f"num_statically_linked_symbols (T): {num_statically_linked_symbols}") + if num_statically_linked_symbols > 0: + raise RuntimeError( + f"Found statically linked libstdc++ symbols (recursive_directory_iterator): {cxx11_statically_linked_symbols[:100]}" + ) + + def check_lib_symbols_for_abi_correctness(lib: str) -> None: print(f"lib: {lib}") cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) @@ -107,6 +127,7 @@ def main() -> None: libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so") check_lib_symbols_for_abi_correctness(libtorch_cpu_path) + check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path) if __name__ == "__main__": diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 7290ff6c8954..3e2dc09ef495 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -34,12 +34,14 @@ fi # Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 -NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) -if [ -n "$NUMBA_CUDA_DIR" ]; then - NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" - pushd "$NUMBA_CUDA_DIR" - patch -p4 <"$NUMBA_PATCH" - popd +if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then + NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) + if [ -n "$NUMBA_CUDA_DIR" ]; then + NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" + pushd "$NUMBA_CUDA_DIR" + patch -p4 <"$NUMBA_PATCH" + popd + fi fi echo "Environment variables:" @@ -322,20 +324,26 @@ test_python_shard() { # modify LD_LIBRARY_PATH to ensure it has the conda env. 
# This set of tests has been shown to be buggy without it for the split-build - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty } test_python() { # shellcheck disable=SC2086 - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION assert_git_not_dirty } test_python_smoke() { - # Smoke tests for H100 - time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + # Smoke tests for H100/B200 + time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + +test_python_smoke_b200() { + # Targeted smoke tests for B200 - staged approach to avoid too many failures + time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty } @@ -384,6 +392,7 @@ test_dynamo_wrapped_shard() { --exclude-distributed-tests \ --exclude-torch-export-tests \ --exclude-aot-dispatch-tests \ + --exclude-quantization-tests \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose \ --upload-artifacts-while-running @@ -428,7 +437,7 @@ test_inductor_distributed() { # this runs on both single-gpu and multi-gpu instance. 
It should be smart about skipping tests that aren't supported # with if required # gpus aren't available - python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose + python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose assert_git_not_dirty } @@ -476,6 +485,22 @@ test_inductor_aoti() { /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile } +test_inductor_aoti_cross_compile_for_windows() { + + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + + # Set WINDOWS_CUDA_HOME environment variable + WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted" + export WINDOWS_CUDA_HOME + + echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME" + echo "Contents:" + ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true + + python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib" +} + test_inductor_cpp_wrapper_shard() { if [[ -z "$NUM_TEST_SHARDS" ]]; then echo "NUM_TEST_SHARDS must be defined to run a Python test shard" @@ -829,7 +854,7 @@ test_dynamo_benchmark() { elif [[ "${suite}" == "timm_models" ]]; then export TORCHBENCH_ONLY_MODELS="inception_v3" elif [[ "${suite}" == "torchbench" ]]; then - export TORCHBENCH_ONLY_MODELS="hf_Bert" + export TORCHBENCH_ONLY_MODELS="BERT_pytorch" fi fi test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@" @@ -860,13 +885,13 @@ test_inductor_torchbench_smoketest_perf() { mkdir -p "$TEST_REPORTS_DIR" python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \ - --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \ + --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \ --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" # The threshold value needs to be actively maintained to make this check useful python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4 # Check memory compression ratio for a few models - for test in hf_Albert timm_vision_transformer; do + for test in BERT_pytorch yolov3; do python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \ --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \ --only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" @@ -877,7 +902,7 @@ test_inductor_torchbench_smoketest_perf() { done # Perform some "warm-start" runs for a few huggingface models. 
- for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do + for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \ --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" python benchmarks/dynamo/check_accuracy.py \ @@ -891,7 +916,7 @@ test_inductor_set_cpu_affinity(){ export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD" export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" - if [[ "${TEST_CONFIG}" != *aarch64* ]]; then + if [[ "$(uname -m)" != "aarch64" ]]; then # Use Intel OpenMP for x86 IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so" export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD" @@ -905,7 +930,7 @@ test_inductor_set_cpu_affinity(){ cores=$((cpus / thread_per_core)) # Set number of cores to 16 on aarch64 for performance runs - if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then + if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then cores=16 fi export OMP_NUM_THREADS=$cores @@ -1156,6 +1181,12 @@ test_distributed() { fi } +test_quantization() { + echo "Testing quantization" + + python test/test_quantization.py +} + test_rpc() { echo "Testing RPC C++ tests" # NB: the ending test_rpc must match the current function name for the current @@ -1402,7 +1433,7 @@ EOF pip3 install -r requirements.txt # shellcheck source=./common-build.sh source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" - python setup.py bdist_wheel --bdist-dir="base_bdist_tmp" --dist-dir="base_dist" + python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist" python -mpip install base_dist/*.whl echo "::endgroup::" @@ -1550,14 +1581,10 @@ test_executorch() { install_torchvision install_torchaudio - pushd /executorch - - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh" - # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch - # from the PR - bash .ci/scripts/setup-linux.sh --build-tool cmake + pushd /executorch + "${INSTALL_SCRIPT}" setup_executorch echo "Run ExecuTorch unit tests" pytest -v -n auto @@ -1571,17 +1598,13 @@ test_executorch() { popd - # Test torchgen generated code for Executorch. 
- echo "Testing ExecuTorch op registration" - "$BUILD_BIN_DIR"/test_edge_op_registration - assert_git_not_dirty } test_linux_aarch64() { python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ - test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ + test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \ distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose @@ -1608,11 +1631,12 @@ test_operator_benchmark() { TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" TEST_DIR=$(pwd) + ARCH=$(uname -m) test_inductor_set_cpu_affinity cd benchmarks/operator_benchmark/pt_extension - python -m pip install . + python -m pip install . -v --no-build-isolation cd "${TEST_DIR}"/benchmarks/operator_benchmark $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \ @@ -1622,9 +1646,28 @@ test_operator_benchmark() { pip_install pandas python check_perf_csv.py \ --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \ - --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" + --expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv" } +test_operator_microbenchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + TEST_DIR=$(pwd) + + cd benchmarks/operator_benchmark/pt_extension + python -m pip install . + + cd "${TEST_DIR}"/benchmarks/operator_benchmark + + for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do + $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \ + --benchmark-name "PyTorch operator microbenchmark" --use-compile + $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \ + --benchmark-name "PyTorch operator microbenchmark" + done +} if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") @@ -1640,7 +1683,7 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0 fi python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py -elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then +elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then test_linux_aarch64 elif [[ "${TEST_CONFIG}" == *backward* ]]; then test_forward_backward_compatibility @@ -1657,6 +1700,8 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then test_executorch elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then test_python_legacy_jit +elif [[ "$TEST_CONFIG" == 'quantization' ]]; then + test_quantization elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" @@ -1679,6 +1724,8 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then test_operator_benchmark cpu ${TEST_MODE} fi +elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then + test_operator_microbenchmark elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then @@ -1687,6 +1734,8 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then test_inductor_triton_cpu elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then test_inductor_micro_benchmark +elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then + test_inductor_aoti_cross_compile_for_windows elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then install_torchvision id=$((SHARD_NUMBER-1)) @@ -1781,10 +1830,14 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then test_xpu_bin elif [[ "${TEST_CONFIG}" == smoke ]]; then test_python_smoke +elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then + test_python_smoke_b200 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then test_h100_distributed elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then test_h100_symm_mem +elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then + test_h100_symm_mem elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then test_h100_cutlass_backend else diff --git a/.ci/pytorch/test_fa3_abi_stable.sh b/.ci/pytorch/test_fa3_abi_stable.sh new file mode 100755 index 000000000000..ff71e9887293 --- /dev/null +++ b/.ci/pytorch/test_fa3_abi_stable.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -ex -o pipefail + +# Suppress ANSI color escape sequences +export TERM=vt100 + +# shellcheck source=./common.sh +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" +# shellcheck source=./common-build.sh +source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" + +echo "Environment variables" +env + +echo "Testing FA3 stable wheel still works with currently built torch" + +echo "Installing ABI Stable FA3 wheel" +# The wheel was built on https://github.com/Dao-AILab/flash-attention/commit/b3846b059bf6b143d1cd56879933be30a9f78c81 +# on torch nightly torch==2.9.0.dev20250830+cu129 +$MAYBE_SUDO pip -q install https://s3.amazonaws.com/ossci-linux/wheels/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl + +pushd flash-attention/hopper +export PYTHONPATH=$PWD +pytest -v -s \ + 
"test_flash_attn.py::test_flash_attn_output[1-1-192-False-False-False-0.0-False-False-mha-dtype0]" \ + "test_flash_attn.py::test_flash_attn_varlen_output[511-1-64-True-False-False-0.0-False-False-gqa-dtype2]" \ + "test_flash_attn.py::test_flash_attn_kvcache[1-128-128-False-False-True-None-0.0-False-False-True-False-True-False-gqa-dtype0]" \ + "test_flash_attn.py::test_flash_attn_race_condition[97-97-192-True-dtype0]" \ + "test_flash_attn.py::test_flash_attn_combine[2-3-64-dtype1]" \ + "test_flash_attn.py::test_flash3_bw_compatibility" +popd diff --git a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 index 29b3e913439c..a165f2a222ca 100644 --- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 +++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 @@ -70,7 +70,7 @@ sccache --zero-stats sccache --show-stats # Build the wheel -python setup.py bdist_wheel +python -m build --wheel --no-build-isolation if ($LASTEXITCODE -ne 0) { exit 1 } # Install the wheel locally diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 19d715b9d0b6..240cc8b55932 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -38,10 +38,12 @@ if errorlevel 1 goto fail if not errorlevel 0 goto fail :: Update CMake +:: TODO: Investigate why this helps MKL detection, even when CMake from choco is not used call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9 if errorlevel 1 goto fail if not errorlevel 0 goto fail +:: TODO: Move to .ci/docker/requirements-ci.txt call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0 if errorlevel 1 goto fail if not errorlevel 0 goto fail @@ -130,14 +132,14 @@ if "%USE_CUDA%"=="1" ( :: Print all existing environment variable for debugging set -python setup.py bdist_wheel +python -m build --wheel --no-isolation if errorlevel 1 goto fail if not errorlevel 0 goto fail sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( if "%BUILD_ENVIRONMENT%"=="" ( - echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. + echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash. ) else ( copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%" diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat index 01e08c8bb4e5..abd2c8722b11 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat @@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" ( ) else ( set CONDA_PARENT_DIR=C:\Jenkins ) - +set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 :: Be conservative here when rolling out the new AMI with conda. This will try :: to install conda as before if it couldn't find the conda installation. 
This :: can be removed eventually after we gain enough confidence in the AMI -if not exist %CONDA_PARENT_DIR%\Miniconda3 ( +if not exist %CONDA_ROOT_DIR% ( set INSTALL_FRESH_CONDA=1 ) @@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( if errorlevel 1 exit /b if not errorlevel 0 exit /b - %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 + %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% if errorlevel 1 exit /b if not errorlevel 0 exit /b ) :: Activate conda so that we can use its commands, i.e. conda, python, pip -call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 +call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% +:: Activate conda so that we can use its commands, i.e. conda, python, pip +call conda activate py_tmp + +call pip install -r .ci/docker/requirements-ci.txt diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 4a464d6b5786..3173582b06f4 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -14,7 +14,7 @@ if not errorlevel 0 exit /b :: build\torch. Rather than changing all these references, making a copy of torch folder :: from conda to the current workspace is easier. The workspace will be cleaned up after :: the job anyway -xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ pushd . if "%VC_VERSION%" == "" ( diff --git a/.ci/pytorch/win-test-helpers/test_libtorch.bat b/.ci/pytorch/win-test-helpers/test_libtorch.bat index ed80fadbc25c..d6ecd7218876 100644 --- a/.ci/pytorch/win-test-helpers/test_libtorch.bat +++ b/.ci/pytorch/win-test-helpers/test_libtorch.bat @@ -15,37 +15,35 @@ if errorlevel 1 exit /b 1 if not errorlevel 0 exit /b 1 cd %TMP_DIR_WIN%\build\torch\test + +:: Enable delayed variable expansion to make the list +setlocal enabledelayedexpansion +set EXE_LIST= for /r "." %%a in (*.exe) do ( - call :libtorch_check "%%~na" "%%~fa" + if "%%~na" == "c10_intrusive_ptr_benchmark" ( + @REM NB: This is not a gtest executable file, thus couldn't be handled by + @REM pytest-cpp and is excluded from test discovery by run_test + call "%%~fa" if errorlevel 1 goto fail + if not errorlevel 0 goto fail + ) else ( + if "%%~na" == "verify_api_visibility" ( + @REM Skip verify_api_visibility as it is a compile-level test + ) else ( + set EXE_LIST=!EXE_LIST! cpp/%%~na + ) + ) ) -goto :eof - -:libtorch_check - cd %CWD% set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test -:: Skip verify_api_visibility as it a compile level test -if "%~1" == "verify_api_visibility" goto :eof +:: Run python test\run_test.py on the list +set NO_TD=True && python test\run_test.py --cpp --verbose -i !EXE_LIST! 
+if errorlevel 1 goto fail +if not errorlevel 0 goto fail -echo Running "%~2" -if "%~1" == "c10_intrusive_ptr_benchmark" ( - :: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp - call "%~2" - goto :eof -) - -python test\run_test.py --cpp --verbose -i "cpp/%~1" -if errorlevel 1 ( - echo %1 failed with exit code %errorlevel% - goto fail -) -if not errorlevel 0 ( - echo %1 failed with exit code %errorlevel% - goto fail -) +goto :eof :eof exit /b 0 diff --git a/.ci/pytorch/win-test-helpers/test_python_shard.bat b/.ci/pytorch/win-test-helpers/test_python_shard.bat index d0fa3babe59d..02829ee36975 100644 --- a/.ci/pytorch/win-test-helpers/test_python_shard.bat +++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat @@ -25,7 +25,7 @@ echo Copying over test times file robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files" echo Run nn tests -python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose if ERRORLEVEL 1 goto fail popd diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index 43524dc04e3f..a01aa0b6431c 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -37,23 +37,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" fi -# TODO: Move both of them to Windows AMI -python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 - -# Install Z3 optional dependency for Windows builds. -python -m pip install z3-solver==4.15.1.0 - -# Install tlparse for test\dynamo\test_structured_trace.py UTs. 
-python -m pip install tlparse==0.4.0 - -# Install parameterized -python -m pip install parameterized==0.8.1 - -# Install pulp for testing ilps under torch\distributed\_tools -python -m pip install pulp==2.9.0 - -# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308 -python -m pip install expecttest==0.3.0 +# TODO: Move this to .ci/docker/requirements-ci.txt +python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2" run_tests() { # Run nvidia-smi if available diff --git a/.ci/pytorch/windows/arm64/build_pytorch.bat b/.ci/pytorch/windows/arm64/build_pytorch.bat index 3363a2d08846..b5c2ef65b84a 100644 --- a/.ci/pytorch/windows/arm64/build_pytorch.bat +++ b/.ci/pytorch/windows/arm64/build_pytorch.bat @@ -48,7 +48,7 @@ sccache --zero-stats sccache --show-stats :: Call PyTorch build script -python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +python -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%" :: show sccache stats sccache --show-stats diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat index bbdfb4bd1bb7..bbd349e2efb4 100644 --- a/.ci/pytorch/windows/cuda128.bat +++ b/.ci/pytorch/windows/cuda128.bat @@ -37,10 +37,10 @@ IF "%CUDA_PATH_V128%"=="" ( ) IF "%BUILD_VISION%" == "" ( - set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( - set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 ) set "CUDA_PATH=%CUDA_PATH_V128%" diff --git a/.ci/pytorch/windows/internal/install_python.bat b/.ci/pytorch/windows/internal/install_python.bat index 84d0f9caccef..86626e15fbc4 100644 --- a/.ci/pytorch/windows/internal/install_python.bat +++ b/.ci/pytorch/windows/internal/install_python.bat @@ -28,5 +28,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t if errorlevel 1 exit /b 1 set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%" -%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel +%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel build if errorlevel 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/setup.bat b/.ci/pytorch/windows/internal/setup.bat index 71056540464c..34a5140cb1ee 100644 --- a/.ci/pytorch/windows/internal/setup.bat +++ b/.ci/pytorch/windows/internal/setup.bat @@ -86,7 +86,7 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_ goto build_end :pytorch -%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +%PYTHON_EXEC% -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%" :build_end IF ERRORLEVEL 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/static_lib_test.bat b/.ci/pytorch/windows/internal/static_lib_test.bat index 
5f23a63d5c20..cd1fc484ae15 100644 --- a/.ci/pytorch/windows/internal/static_lib_test.bat +++ b/.ci/pytorch/windows/internal/static_lib_test.bat @@ -63,7 +63,7 @@ if errorlevel 1 exit /b 1 call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 -call conda install -y -q -c conda-forge libuv=1.39 +call conda install -y -q -c conda-forge libuv=1.51 call conda install -y -q intel-openmp echo "install and test libtorch" diff --git a/.ci/pytorch/windows/setup_build.bat b/.ci/pytorch/windows/setup_build.bat index dbdc9891324c..a7addd5d712d 100644 --- a/.ci/pytorch/windows/setup_build.bat +++ b/.ci/pytorch/windows/setup_build.bat @@ -18,7 +18,7 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake %PYTHON_EXEC% -m pip install pyyaml %PYTHON_EXEC% -m pip install mkl-include mkl-static -%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0 +%PYTHON_EXEC% -m pip install boto3 requests ninja typing_extensions setuptools==72.1.0 where cmake.exe diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index 98b50c0ceeaf..6123e8abc8c0 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -143,7 +143,8 @@ case $desired_python in RENAME_WHEEL=false ;; 3.13t) - echo "Using 3.13 deps" + echo "Using 3.13t deps" + mac_version='macosx-11.0-arm64' NUMPY_PINNED_VERSION="==2.1.0" RENAME_WHEEL=false ;; @@ -177,8 +178,7 @@ source ~/${desired_python}-build/bin/activate retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt" retry brew install libomp -# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which -# is build as part of tensorpipe submodule +# For USE_DISTRIBUTED=1 on macOS, we need libuv, which is built as part of the tensorpipe submodule export USE_DISTRIBUTED=1 export USE_MKLDNN=OFF @@ -186,11 +186,11 @@ export USE_QNNPACK=OFF export BUILD_TEST=OFF pushd "$pytorch_rootdir" -echo "Calling setup.py bdist_wheel at $(date)" +echo "Calling -m build --wheel --no-isolation at $(date)" -_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}" +_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python -m build --wheel --no-isolation --outdir "$whl_tmp_dir" -C--plat-name="${mac_version//[-.]/_}" -echo "Finished setup.py bdist_wheel at $(date)" +echo "Finished -m build --wheel --no-isolation at $(date)" if [[ $package_type != 'libtorch' ]]; then echo "delocating wheel dependencies" diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index f5b949858d60..f12a3ac07517 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -71,14 +71,7 @@ export PYTORCH_BUILD_NUMBER=1 # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) - -# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT -TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" - -# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries. -if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then - TRITON_CONSTRAINT="platform_system == 'Linux'" -fi +TRITON_CONSTRAINT="platform_system == 'Linux'" if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && !
"$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" diff --git a/.circleci/scripts/functorch_doc_push_script.sh b/.circleci/scripts/functorch_doc_push_script.sh deleted file mode 100755 index 010956e21252..000000000000 --- a/.circleci/scripts/functorch_doc_push_script.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# =================== The following code **should** be executed inside Docker container =================== - -# Install dependencies -sudo apt-get -y update -sudo apt-get -y install expect-dev - -# This is where the local pytorch install in the docker image is located -pt_checkout="/var/lib/jenkins/workspace" -source "$pt_checkout/.ci/pytorch/common_utils.sh" -echo "functorch_doc_push_script.sh: Invoked with $*" - -set -ex - -version=${DOCS_VERSION:-nightly} -echo "version: $version" - -# Build functorch docs -pushd $pt_checkout/functorch/docs -pip -q install -r requirements.txt -make html -popd - -git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages -pushd functorch_ghpages - -if [ $version == "main" ]; then - version=nightly -fi - -git rm -rf "$version" || true -mv "$pt_checkout/functorch/docs/build/html" "$version" - -git add "$version" || true -git status -git config user.email "soumith+bot@pytorch.org" -git config user.name "pytorchbot" -# If there aren't changes, don't make a commit; push is no-op -git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true -git status - -if [[ "${WITH_PUSH:-}" == true ]]; then - git push -u origin gh-pages -fi - -popd -# =================== The above code **should** be executed inside Docker container =================== diff --git a/.clang-tidy b/.clang-tidy index 4b1548d646b2..71ffdf8cb224 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -59,16 +59,19 @@ performance-*, -performance-enum-size, readability-container-size-empty, readability-delete-null-pointer, -readability-duplicate-include +readability-duplicate-include, readability-misplaced-array-index, -readability-redundant* +readability-redundant*, readability-simplify-subscript-expr, readability-string-compare, -readability-redundant-access-specifiers, -readability-redundant-control-flow, +-readability-redundant-inline-specifier, ' HeaderFilterRegex: '^(aten/|c10/|torch/).*$' WarningsAsErrors: '*' +LineFilter: + - name: '/usr/include/.*' CheckOptions: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true diff --git a/.flake8 b/.flake8 index fa73b7b880fd..937234edb403 100644 --- a/.flake8 +++ b/.flake8 @@ -7,16 +7,12 @@ max-line-length = 120 # C408 ignored because we like the dict keyword argument syntax # E501 is not flexible enough, we're using B950 instead ignore = - E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824, + E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824, # shebang has extra meaning in fbcode lints, so I think it's not worth trying # to line this up with executable bit EXE001, # these ignores are from flake8-bugbear; please fix! - B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910 - # these ignores are from flake8-comprehensions; please fix! - C407, - # these ignores are from flake8-logging-format; please fix! - G100,G101,G200 + B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910 # these ignores are from flake8-simplify. 
please fix or ignore with commented reason SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, # SIM104 is already covered by pyupgrade ruff diff --git a/.github/ISSUE_TEMPLATE/ci-sev.md b/.github/ISSUE_TEMPLATE/ci-sev.md index a7e7006aaea5..1ed74161f55d 100644 --- a/.github/ISSUE_TEMPLATE/ci-sev.md +++ b/.github/ISSUE_TEMPLATE/ci-sev.md @@ -1,9 +1,14 @@ --- name: "⚠️ CI SEV" about: Tracking incidents for PyTorch's CI infra. +title: '' +labels: '' +assignees: '' + --- > NOTE: Remember to label this issue with "`ci: sev`" +> If you want autorevert to be disabled, keep the ci: disable-autorevert label diff --git a/.github/ISSUE_TEMPLATE/disable-autorevert.md b/.github/ISSUE_TEMPLATE/disable-autorevert.md new file mode 100644 index 000000000000..a76f2e4222eb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/disable-autorevert.md @@ -0,0 +1,18 @@ +--- +name: "D❌​\U0001F519​ ISABLE AUTOREVERT" +about: Disables autorevert when open +title: "[DISABLE AUTOREVERT]" +labels: 'ci: disable-autorevert' +assignees: '' + +--- + +This issue, while open, disables the autorevert functionality. + +More details can be found [here](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md) + + +## Why are you disabling autorevert? + + +## Links to any issues/commits/errors that shows the source of problem diff --git a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md index 8bea044cfd4b..d9e0cc22bd3f 100644 --- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md +++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md @@ -1,8 +1,10 @@ --- name: Disable CI jobs (PyTorch Dev Infra only) about: Use this template to disable CI jobs -title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]" -labels: "module: ci" +title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME] +labels: 'module: ci' +assignees: '' + --- > For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index d4a7df9d5805..2c49247c0aa5 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -22,6 +22,9 @@ self-hosted-runner: - linux.arm64.m7g.4xlarge - linux.arm64.m7g.4xlarge.ephemeral - linux.arm64.r7g.12xlarge.memory + - linux.aws.h100 + - linux.aws.h100.4 + - linux.aws.h100.8 - linux.4xlarge.nvidia.gpu - linux.8xlarge.nvidia.gpu - linux.16xlarge.nvidia.gpu @@ -51,12 +54,17 @@ self-hosted-runner: - windows-11-arm64 - windows-11-arm64-preview # Organization-wide AMD-hosted runners - # MI2xx runners + # MI2xx non-ARC runners - linux.rocm.gpu - - linux.rocm.gpu.mi250 - linux.rocm.gpu.2 - linux.rocm.gpu.4 - # gfx942 runners + - linux.rocm.gpu.mi250 + - linux.rocm.gpu.gfx1100 + # MI2xx ARC runners + - linux.rocm.gpu.mi250.1 + - linux.rocm.gpu.mi250.2 + - linux.rocm.gpu.mi250.4 + # gfx942 ARC runners - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 - linux.rocm.gpu.gfx942.4 diff --git a/.github/actions/build-external-packages/action.yml b/.github/actions/build-external-packages/action.yml index c0c727d93ac6..049c3ce28e45 100644 --- a/.github/actions/build-external-packages/action.yml +++ b/.github/actions/build-external-packages/action.yml @@ -65,7 +65,7 @@ runs: cd .ci/lumen_cli python3 -m pip install -e . 
) - MAX_JOBS="$(nproc --ignore=6)" + MAX_JOBS="$(nproc --ignore=10)" export MAX_JOBS # Split the comma-separated list and build each target diff --git a/.github/actions/linux-test/action.yml b/.github/actions/linux-test/action.yml index 32fe1d7385b1..f29d776402ba 100644 --- a/.github/actions/linux-test/action.yml +++ b/.github/actions/linux-test/action.yml @@ -274,8 +274,6 @@ runs: -w /var/lib/jenkins/workspace \ "${DOCKER_IMAGE}" ) - # Propagate download.pytorch.org IP to container - grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" diff --git a/.github/actions/reuse-old-whl/reuse_old_whl.py b/.github/actions/reuse-old-whl/reuse_old_whl.py index def0276a9c8a..48a849098594 100644 --- a/.github/actions/reuse-old-whl/reuse_old_whl.py +++ b/.github/actions/reuse-old-whl/reuse_old_whl.py @@ -264,7 +264,7 @@ def change_content_to_new_version(file: Union[str, Path]) -> None: change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") for file in Path(f"artifacts/dist/{old_stem}").glob( - "*.dist-info/**", + "*.dist-info/*", ): change_content_to_new_version(file) diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index 5af32ac03497..4370549e4801 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -28,6 +28,10 @@ runs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" + - name: Print GPU info (if present) + shell: bash + run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi + - name: Check if in a container runner shell: bash id: check_container_runner @@ -82,37 +86,6 @@ runs: # Prune all of the docker images docker system prune -af - - name: Manually resolve download.pytorch.org - shell: bash - continue-on-error: true - run: | - set +e - set -x - - PT_DOMAIN=download.pytorch.org - # TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400, - # cleaning this up once the issue is fixed. There are more than one resolved IP here, the last - # one is returned at random - RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1) - - if [ -z "${RESOLVED_IP}" ]; then - echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..." - RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1) - - if [ -z "${RESOLVED_IP}" ]; then - echo "Couldn't resolve ${PT_DOMAIN}, exiting..." - exit 1 - fi - fi - - if grep -r "${PT_DOMAIN}" /etc/hosts; then - # Clean up any old records first - sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts - fi - - echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts - cat /etc/hosts - - name: Check that the docker daemon is running shell: bash continue-on-error: true diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index a58db801b1cf..07c649985b79 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -111,3 +111,16 @@ runs: # This video group ID maps to subgid 1 inside the docker image due to the /etc/subgid entries. # The group name corresponding to group ID 1 can change depending on the OS, so both are necessary. 
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --network=host" >> "${GITHUB_ENV}" + + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: true + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 93c957896b5e..2ea330f93b49 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -6,6 +6,12 @@ inputs: cuda-version: description: which cuda version to install, 'cpu' for none required: true + python-version: + required: false + type: string + default: "3.10" + description: | + The python version to be used. Will be 3.10 by default runs: using: composite @@ -38,18 +44,24 @@ runs: CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" { + echo "CONDA=${CONDA}"; echo "CONDA_RUN=${CONDA} run --no-capture-output"; echo "CONDA_BUILD=${CONDA} run conda-build"; echo "CONDA_INSTALL=${CONDA} install"; } >> "${GITHUB_ENV}" - name: Setup Python3 + env: + PYTHON_VERSION: ${{ inputs.python-version }} shell: bash run: | set +e set -x - PYTHON3=$(${CONDA_RUN} which python3) + # Create new py_tmp env with python-version + ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv + + PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then @@ -62,7 +74,7 @@ runs: # installation, which is Python 3 based. Its Python is default to Python 3. Further, there # is also the Miniconda installation that is Python 2 based, and both can be installed if # needed. In both cases, Python binary is just called python - PYTHON=$(${CONDA_RUN} which python) + PYTHON=$(${CONDA_RUN} -n py_tmp which python) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then diff --git a/.github/actions/teardown-win/action.yml b/.github/actions/teardown-win/action.yml index 799b20812b96..b5e5f74db037 100644 --- a/.github/actions/teardown-win/action.yml +++ b/.github/actions/teardown-win/action.yml @@ -23,9 +23,6 @@ runs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main - # Cleaning up Windows workspace sometimes fails flakily with device or resource busy # error, meaning one or more processes haven't stopped completely yet. So trying to # retry this step several time similar to how checkout-pytorch GHA does diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml index d4b8be8b609a..991cf9fb87ef 100644 --- a/.github/actions/test-pytorch-binary/action.yml +++ b/.github/actions/test-pytorch-binary/action.yml @@ -33,10 +33,6 @@ runs: ) echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" - if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then - # Propagate download.pytorch.org IP to container. 
This is only needed on Linux non aarch64 runner - grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts" - fi docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" # Generate test script diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 05e0b684b427..8af554d56ee5 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -87ff22e49ed0e92576c4935ccb8c143daac4a3cd +69bbe7363897764f9e758d851cd0340147d27f94 diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 4a57d6e374bd..6cc41d703bd5 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -966da7e46f65d6d49df3e31214470a4fe5cc8e66 +faffd5cf673615583da6517275e361cb3dbc77e6 diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index 8ac38f3e1f4c..45ad7752358c 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ -1 +1 @@ -5963b98b465007e3cfb0d39447e4459a8afa96dc +e5192819208c4d68194844b7dfafbc00020d0dea diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 504d924ec764..1bac2adbb56d 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -c77852e117bdf056c8e9a087e51d6f65cf6ba53d +0fa6e3129e61143224663e1ec67980d12b7ec4eb diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile similarity index 61% rename from .github/ci_configs/vllm/Dockerfile.tmp_vllm rename to .github/ci_configs/vllm/Dockerfile index a1b68ad28210..1aefa1be9831 100644 --- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm +++ b/.github/ci_configs/vllm/Dockerfile @@ -1,59 +1,41 @@ -# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo -# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing - ARG CUDA_VERSION=12.8.1 ARG PYTHON_VERSION=3.12 # BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine, # by default, it uses the torch-nightly-base stage from this docker image ARG BUILD_BASE_IMAGE=torch-nightly-base - -# FINAL_BASE_IMAGE: used to set up vllm-instaled environment and build flashinfer, -# by default, it uses devel-ubuntu22.04 official image. 
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 # The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py" - #################### TORCH NIGHTLY BASE IMAGE #################### -# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base ARG CUDA_VERSION ARG PYTHON_VERSION ARG GET_PIP_URL -# Install Python and other dependencies +# Install system dependencies and uv, then create Python virtual environment RUN apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl wget sudo vim \ - && add-apt-repository -y ppa:deadsnakes/ppa \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ + && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \ + && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \ + && ln -s /opt/venv/bin/python3 /usr/bin/python3 \ + && ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \ + && ln -s /opt/venv/bin/pip /usr/bin/pip \ && python3 --version && python3 -m pip --version # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels -# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519) -RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \ - if command -v apt-get >/dev/null; then \ - if [ "$current_gcc_version" -lt 10 ]; then \ - echo "GCC version is $current_gcc_version, installing gcc-10..."; \ - apt-get update \ - && apt-get install -y gcc-10 g++-10 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \ - && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \ - else \ - echo "GCC version is $current_gcc_version, no need to install gcc-10."; \ - fi \ - fi \ - && gcc --version && g++ --version +RUN apt-get install -y gcc-10 g++-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 +RUN </dev/null; then \ apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl wget sudo vim; \ + && apt-get install -y ccache software-properties-common git wget sudo vim; \ else \ - dnf install -y git curl wget sudo; \ + dnf install -y git wget sudo; \ fi \ && python3 --version && python3 -m pip --version # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ - if ! 
python3 -m uv --version >/dev/null 2>&1; then \ - python3 -m pip install uv==0.8.4; \ - fi + python3 -m pip install uv==0.8.4 + ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts @@ -98,15 +76,15 @@ ENV UV_LINK_MODE=copy WORKDIR /workspace -# install build and runtime dependencies +# Install build and runtime dependencies COPY requirements/common.txt requirements/common.txt COPY use_existing_torch.py use_existing_torch.py COPY pyproject.toml pyproject.toml -# install build and runtime dependencies without stable torch version +# Install build and runtime dependencies without stable torch version RUN python3 use_existing_torch.py -# default mount file as placeholder, this just avoid the mount error +# Default mount file as placeholder, this just avoid the mount error # change to a different vllm folder if this does not exist anymore ARG TORCH_WHEELS_PATH="./requirements" ARG PINNED_TORCH_VERSION @@ -138,56 +116,36 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/common.txt -# Must put before installing xformers, so it can install the correct version of xfomrers. -ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a' -ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list} - ARG max_jobs=16 ENV MAX_JOBS=${max_jobs} -RUN echo ${TORCH_CUDA_ARCH_LIST} -RUN echo ${MAX_JOBS} -RUN pip freeze | grep -E 'ninja' +RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' + export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a' + git clone https://github.com/facebookresearch/xformers.git -# Build xformers with cuda and torch nightly/wheel -# following official xformers guidance: https://github.com/facebookresearch/xformers#build -# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2 -ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468 -ENV CCACHE_DIR=/root/.cache/ccache + pushd xformers + git checkout v0.0.32.post2 + git submodule update --init --recursive + python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose + popd -RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/uv \ - echo 'git clone xformers...' \ - && git clone https://github.com/facebookresearch/xformers.git --recursive \ - && cd xformers \ - && git checkout ${XFORMERS_COMMIT} \ - && git submodule update --init --recursive \ - && echo 'finish git clone xformers...' \ - && rm -rf build \ - && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \ - && cd .. \ - && rm -rf xformers + rm -rf xformers +BASH RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system xformers-dist/*.whl --verbose + uv pip install --system xformers-dist/*.whl -# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage. -# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt - RUN cat torch_build_versions.txt RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio' - #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### -# Image used to build vllm wheel FROM base AS build ARG TARGETPLATFORM COPY . . 
- RUN python3 use_existing_torch.py RUN --mount=type=cache,target=/root/.cache/uv \ @@ -197,20 +155,17 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi -# Max jobs used by Ninja to build extensions ARG max_jobs=16 ENV MAX_JOBS=${max_jobs} -ARG nvcc_threads=4 +ARG nvcc_threads=8 ENV NVCC_THREADS=$nvcc_threads -ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' -ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ARG USE_SCCACHE ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 -# if USE_SCCACHE is set, use sccache to speed up compilation +# Use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" = "1" ]; then \ @@ -235,6 +190,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && sccache --show-stats; \ fi +ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0' +ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} + ARG vllm_target_device="cuda" ENV VLLM_TARGET_DEVICE=${vllm_target_device} ENV CCACHE_DIR=/root/.cache/ccache @@ -248,17 +206,10 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \ fi - -RUN echo "[INFO] Listing current directory:" && \ - ls -al && \ - echo "[INFO] Showing torch_build_versions.txt content:" && \ - cat torch_build_versions.txt - #################### WHEEL BUILD IMAGE #################### ################### VLLM INSTALLED IMAGE #################### -# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer FROM ${FINAL_BASE_IMAGE} AS vllm-base USER root @@ -266,7 +217,7 @@ ARG CUDA_VERSION ARG PYTHON_VERSION ARG GET_PIP_URL -# TODO (huydhn): Only work with PyTorch manylinux builder +# Only work with PyTorch manylinux builder ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" # prepare for environment starts @@ -275,20 +226,19 @@ WORKDIR /workspace # Install Python and other dependencies RUN if command -v apt-get >/dev/null; then \ apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl wget sudo vim \ - && add-apt-repository -y ppa:deadsnakes/ppa \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \ + && apt-get install -y ccache software-properties-common git sudo vim python3-pip; \ else \ - dnf install -y git curl wget sudo; \ + dnf install -y git wget sudo; \ fi \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \ + && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \ + && ln -s /opt/venv/bin/python3 /usr/bin/python3 \ + && ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \ + && ln -s /opt/venv/bin/pip /usr/bin/pip \ && python3 --version && python3 -m pip --version -# Get the torch versions, and whls used in previous stagtes for consistency +# Get the torch versions, and whls used in previous stage COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt COPY --from=base 
/workspace/xformers-dist /wheels/xformers COPY --from=build /workspace/vllm-dist /wheels/vllm @@ -299,19 +249,27 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \ # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ - if ! python3 -m uv --version > /dev/null 2>&1; then \ - python3 -m pip install uv==0.8.4; \ - fi + python3 -m pip install uv==0.8.4 + ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy +# Install build and runtime dependencies, this is needed for flashinfer install +COPY requirements/build.txt requirements/build.txt +COPY use_existing_torch.py use_existing_torch.py +RUN python3 use_existing_torch.py +RUN cat requirements/build.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt + # Default mount file as placeholder, this just avoid the mount error ARG TORCH_WHEELS_PATH="./requirements" -# Install torch, torchaudio and torchvision -# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt -# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine +# Install torch, torchaudio and torchvision. If TORCH_WHEELS_PATH is default +# to ./requirements, it will pull the nightly versions using pip. Otherwise, +# it will use the local wheels from TORCH_WHEELS_PATH RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ --mount=type=cache,target=/root/.cache/uv \ if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ @@ -333,19 +291,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system /wheels/xformers/*.whl --verbose -# Build flashinfer from source. +# Build FlashInfer from source ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0' -# install package for build flashinfer -# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 - -RUN pip install build==1.3.0 -RUN pip freeze | grep -E 'setuptools|packaging|build' - ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} -# Build flashinfer for torch nightly from source around 10 mins + ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" -# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt ARG FLASHINFER_GIT_REF="v0.2.14.post1" + RUN --mount=type=cache,target=/root/.cache/uv \ git clone --depth 1 --recursive --shallow-submodules \ --branch ${FLASHINFER_GIT_REF} \ @@ -357,7 +309,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && cd .. \ && rm -rf flashinfer -# install flashinfer python +# Install FlashInfer RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system wheels/flashinfer/*.whl --verbose @@ -367,49 +319,6 @@ RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm ################### VLLM INSTALLED IMAGE #################### -#################### UNITTEST IMAGE ############################# -FROM vllm-base as test - -ENV UV_HTTP_TIMEOUT=500 -ENV UV_INDEX_STRATEGY="unsafe-best-match" -# Use copy mode to avoid hardlink failures with Docker cache mounts -ENV UV_LINK_MODE=copy - -COPY tests/ tests/ -COPY examples examples -COPY benchmarks benchmarks -COPY ./vllm/collect_env.py . 
-COPY requirements/common.txt requirements/common.txt -COPY use_existing_torch.py use_existing_torch.py -COPY pyproject.toml pyproject.toml -# Install build and runtime dependencies without stable torch version -COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt - -RUN python3 use_existing_torch.py - -# install packages -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/common.txt -# enable fast downloads from hf (for testing) -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system hf_transfer -ENV HF_HUB_ENABLE_HF_TRANSFER 1 - -# install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -e tests/vllm_test_utils - -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/nightly_torch_test.txt - -# Logging to confirm the torch versions -RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' - -# Logging to confirm all the packages are installed -RUN pip freeze - -#################### UNITTEST IMAGE ############################# - #################### EXPORT STAGE #################### FROM scratch as export-wheels diff --git a/.github/ci_configs/vllm/use_existing_torch.py b/.github/ci_configs/vllm/use_existing_torch.py index f55db97850d9..3d59fd67a398 100644 --- a/.github/ci_configs/vllm/use_existing_torch.py +++ b/.github/ci_configs/vllm/use_existing_torch.py @@ -1,9 +1,14 @@ import glob +import os requires_files = glob.glob("requirements/*.txt") requires_files += ["pyproject.toml"] + for file in requires_files: + if not os.path.exists(file): + print(f"!!! skipping missing {file}") + continue print(f">>> cleaning {file}") with open(file) as f: lines = f.readlines() diff --git a/.github/labeler.yml b/.github/labeler.yml index 8b1acc77c267..7b47b9fefb5d 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -130,3 +130,35 @@ - torch/csrc/inductor/aoti_include/** - torchgen/aoti/** - torchgen/gen_aoti_c_shim.py + +"ciflow/vllm": +- .github/ci_commit_pins/vllm.txt + +"ciflow/b200": +- test/test_matmul_cuda.py +- test/test_scaled_matmul_cuda.py +- test/inductor/test_fp8.py +- aten/src/ATen/native/cuda/Blas.cpp +- torch/**/*cublas* +- torch/_inductor/kernel/mm.py +- test/inductor/test_max_autotune.py +- third_party/fbgemm + +"ciflow/h100": +- test/test_matmul_cuda.py +- test/test_scaled_matmul_cuda.py +- test/inductor/test_fp8.py +- aten/src/ATen/native/cuda/Blas.cpp +- torch/**/*cublas* +- torch/_inductor/kernel/mm.py +- test/inductor/test_max_autotune.py +- third_party/fbgemm + +"ciflow/rocm": +- test/test_matmul_cuda.py +- test/test_scaled_matmul_cuda.py +- test/inductor/test_fp8.py +- aten/src/ATen/native/cuda/Blas.cpp +- torch/_inductor/kernel/mm.py +- test/inductor/test_max_autotune.py +- third_party/fbgemm diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml index 354381755ce5..e75b80dc4689 100644 --- a/.github/merge_rules.yaml +++ b/.github/merge_rules.yaml @@ -525,6 +525,21 @@ - Lint - pull +- name: typechecking + patterns: + - 'pyrefly.toml' + - 'mypy.ini' + - 'mypy-strict.ini' + approved_by: + - lolpack + - maggiemoss + - ndmitchell + - kinto0 + mandatory_checks_name: + - EasyCLA + - Lint + - pull + - name: superuser patterns: - '*' diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index a0aa6921b92b..74b0d243859a 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,41 +1,48 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 
ciflow_push_tags: +- ciflow/b200 +- ciflow/b200-symm-mem +- ciflow/b200-distributed - ciflow/binaries - ciflow/binaries_libtorch - ciflow/binaries_wheel -- ciflow/triton_binaries +- ciflow/h100 +- ciflow/h100-cutlass-backend +- ciflow/h100-distributed +- ciflow/h100-symm-mem - ciflow/inductor -- ciflow/inductor-periodic -- ciflow/inductor-rocm -- ciflow/inductor-perf-test-nightly-rocm -- ciflow/inductor-perf-compare +- ciflow/inductor-cu126 - ciflow/inductor-micro-benchmark - ciflow/inductor-micro-benchmark-cpu-x86 +- ciflow/inductor-perf-compare +- ciflow/inductor-perf-test-nightly-rocm-mi300 +- ciflow/inductor-perf-test-nightly-rocm-mi355 - ciflow/inductor-perf-test-nightly-x86-zen -- ciflow/inductor-cu126 +- ciflow/inductor-periodic +- ciflow/inductor-rocm - ciflow/linux-aarch64 - ciflow/mps - ciflow/nightly +- ciflow/op-benchmark - ciflow/periodic - ciflow/periodic-rocm-mi300 +- ciflow/pull +- ciflow/quantization-periodic +- ciflow/riscv64 - ciflow/rocm - ciflow/rocm-mi300 +- ciflow/rocm-mi355 +- ciflow/rocm-navi31 - ciflow/s390 -- ciflow/riscv64 - ciflow/slow +- ciflow/torchbench +- ciflow/triton_binaries - ciflow/trunk - ciflow/unstable -- ciflow/xpu - ciflow/vllm -- ciflow/torchbench -- ciflow/op-benchmark -- ciflow/pull -- ciflow/h100 -- ciflow/h100-distributed - ciflow/win-arm64 -- ciflow/h100-symm-mem -- ciflow/h100-cutlass-backend +- ciflow/xpu retryable_workflows: - pull - trunk @@ -44,4 +51,4 @@ retryable_workflows: - inductor-A100-perf-nightly labeler_config: labeler.yml label_to_label_config: label_to_label.yml -mergebot: True +mergebot: true diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt deleted file mode 100644 index 5fc26302a0ad..000000000000 --- a/.github/requirements/pip-requirements-macOS.txt +++ /dev/null @@ -1,36 +0,0 @@ -boto3==1.35.42 -cmake==3.27.* -expecttest==0.3.0 -fbscribelogger==0.1.7 -filelock==3.18.0 -hypothesis==6.56.4 -librosa>=0.6.2 -mpmath==1.3.0 -networkx==2.8.7 -ninja==1.10.2.4 -numba==0.59.0 -numpy==1.26.4 -opt-einsum>=3.3 -optree==0.13.0 -packaging==23.1 -parameterized==0.8.1 -pillow==10.3.0 -protobuf==5.29.5 -psutil==5.9.8 -pygments==2.15.0 -pytest-cpp==2.3.0 -pytest-flakefinder==1.1.0 -pytest-rerunfailures==10.3 -pytest-subtests==0.13.1 -pytest-xdist==3.3.1 -pytest==7.3.2 -pyyaml==6.0.2 -scipy==1.12.0 -setuptools==78.1.1 -sympy==1.13.3 -tlparse==0.4.0 -tensorboard==2.13.0 -typing-extensions==4.12.2 -unittest-xml-reporting<=3.2.0,>=2.0.0 -xdoctest==1.1.0 -z3-solver==4.15.1.0 diff --git a/.github/scripts/drci_mocks.json.gz b/.github/scripts/drci_mocks.json.gz index b8c483013694..4e03d0672bdd 100644 Binary files a/.github/scripts/drci_mocks.json.gz and b/.github/scripts/drci_mocks.json.gz differ diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 9ba210a5ed2b..592c7aab6d93 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -502,6 +502,7 @@ def perform_misc_tasks( job_name: str, pr_body: str, branch: Optional[str] = None, + tag: Optional[str] = None, ) -> None: """ In addition to apply the filter logic, the script also does the following @@ -509,7 +510,11 @@ def perform_misc_tasks( """ set_output( "keep-going", - branch == MAIN_BRANCH or check_for_setting(labels, pr_body, "keep-going"), + branch == MAIN_BRANCH + or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag)) + # Pattern for tags created via manual run on HUD + or bool(tag and re.match(r"^ciflow/[^/]+/[a-f0-9]{40}$", tag)) + or 
check_for_setting(labels, pr_body, "keep-going"), ) set_output( "ci-verbose-test-logs", @@ -634,6 +639,7 @@ def main() -> None: job_name=args.job_name, pr_body=pr_body if pr_body else "", branch=args.branch, + tag=tag, ) # Set the filtered test matrix as the output diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index e57c2d5ef074..154b5a6f0b90 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -16,21 +16,23 @@ # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this -CUDA_ARCHES = ["12.6", "12.8", "13.0"] +CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"] CUDA_STABLE = "12.8" CUDA_ARCHES_FULL_VERSION = { "12.6": "12.6.3", "12.8": "12.8.1", + "12.9": "12.9.1", "13.0": "13.0.0", } CUDA_ARCHES_CUDNN_VERSION = { "12.6": "9", "12.8": "9", + "12.9": "9", "13.0": "9", } # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this -ROCM_ARCHES = ["6.3", "6.4"] +ROCM_ARCHES = ["6.4", "7.0"] XPU_ARCHES = ["xpu"] @@ -38,7 +40,7 @@ CPU_S390X_ARCH = ["cpu-s390x"] -CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"] +CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"] PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { @@ -76,6 +78,23 @@ "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'" ), + "12.9": ( + "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'" + ), "13.0": ( "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | " @@ -222,7 +241,11 @@ def generate_libtorch_matrix( arches += CUDA_ARCHES arches += ROCM_ARCHES elif os == "windows": - arches += CUDA_ARCHES + # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up + # in 2.10 + windows_cuda_arches = CUDA_ARCHES.copy() + windows_cuda_arches.remove("12.9") + arches += windows_cuda_arches if libtorch_variants is None: libtorch_variants = [ "shared-with-deps", @@ -286,7 +309,11 @@ def generate_wheels_matrix( if os == "linux": arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES elif os == "windows": - arches += CUDA_ARCHES + XPU_ARCHES + # TODO (huydhn): Only build CUDA 12.9 for Linux. 
This logic is to be cleaned up + # in 2.10 + windows_cuda_arches = CUDA_ARCHES.copy() + windows_cuda_arches.remove("12.9") + arches += windows_cuda_arches + XPU_ARCHES elif os == "linux-aarch64": # Separate new if as the CPU type is different and # uses different build/test scripts @@ -322,7 +349,7 @@ def generate_wheels_matrix( # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install if ( - arch_version in ["13.0", "12.8", "12.6"] + arch_version in ["13.0", "12.9", "12.8", "12.6"] and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -386,5 +413,6 @@ def generate_wheels_matrix( validate_nccl_dep_consistency("13.0") +validate_nccl_dep_consistency("12.9") validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 67906d4ad88d..7d22e5059b7c 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -127,53 +127,6 @@ class OperatingSystem: ), ] -ROCM_SMOKE_WORKFLOWS = [ - BinaryBuildWorkflow( - os=OperatingSystem.LINUX, - package_type="manywheel", - build_variant="rocm", - build_configs=generate_binary_build_matrix.generate_wheels_matrix( - OperatingSystem.LINUX, - arches=["6.4"], - python_versions=["3.9"], - ), - ciflow_config=CIFlowConfig( - labels={ - LABEL_CIFLOW_BINARIES, - LABEL_CIFLOW_BINARIES_WHEEL, - LABEL_CIFLOW_ROCM, - }, - isolated_workflow=True, - ), - branches="main", - ), -] - -LINUX_BINARY_SMOKE_WORKFLOWS = [ - BinaryBuildWorkflow( - os=OperatingSystem.LINUX, - package_type="manywheel", - build_configs=generate_binary_build_matrix.generate_wheels_matrix( - OperatingSystem.LINUX, - arches=["12.8"], - python_versions=["3.12"], - ), - branches="main", - ), - BinaryBuildWorkflow( - os=OperatingSystem.LINUX, - package_type="libtorch", - build_variant=generate_binary_build_matrix.RELEASE, - build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.LINUX, - generate_binary_build_matrix.RELEASE, - arches=["cpu"], - libtorch_variants=["shared-with-deps"], - ), - branches="main", - ), -] - WINDOWS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, @@ -259,39 +212,6 @@ class OperatingSystem: ), ] -WINDOWS_BINARY_SMOKE_WORKFLOWS = [ - BinaryBuildWorkflow( - os=OperatingSystem.WINDOWS, - package_type="libtorch", - build_variant=generate_binary_build_matrix.RELEASE, - build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.WINDOWS, - generate_binary_build_matrix.RELEASE, - arches=["cpu"], - libtorch_variants=["shared-with-deps"], - ), - branches="main", - ciflow_config=CIFlowConfig( - isolated_workflow=True, - ), - ), - BinaryBuildWorkflow( - os=OperatingSystem.WINDOWS, - package_type="libtorch", - build_variant=generate_binary_build_matrix.DEBUG, - build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.WINDOWS, - generate_binary_build_matrix.DEBUG, - arches=["cpu"], - libtorch_variants=["shared-with-deps"], - ), - branches="main", - ciflow_config=CIFlowConfig( - isolated_workflow=True, - ), - ), -] - MACOS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, @@ -372,23 +292,10 @@ def main() -> None: jinja_env.get_template("linux_binary_build_workflow.yml.j2"), S390X_BINARY_BUILD_WORKFLOWS, ), - ( - # Give rocm it's own workflow file - jinja_env.get_template("linux_binary_build_workflow.yml.j2"), - ROCM_SMOKE_WORKFLOWS, - ), - ( - 
jinja_env.get_template("linux_binary_build_workflow.yml.j2"), - LINUX_BINARY_SMOKE_WORKFLOWS, - ), ( jinja_env.get_template("windows_binary_build_workflow.yml.j2"), WINDOWS_BINARY_BUILD_WORKFLOWS, ), - ( - jinja_env.get_template("windows_binary_build_workflow.yml.j2"), - WINDOWS_BINARY_SMOKE_WORKFLOWS, - ), ( jinja_env.get_template("macos_binary_build_workflow.yml.j2"), MACOS_BINARY_BUILD_WORKFLOWS, ), diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 0fc60cb31e2a..110015988a5c 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -18,6 +18,7 @@ class GitHubComment: body_text: str created_at: str author_login: str + author_url: Optional[str] author_association: str editor_login: Optional[str] database_id: int diff --git a/.github/scripts/gql_mocks.json.gz b/.github/scripts/gql_mocks.json.gz index 67355239dc42..70663a01e777 100644 Binary files a/.github/scripts/gql_mocks.json.gz and b/.github/scripts/gql_mocks.json.gz differ diff --git a/.github/scripts/test_check_labels.py b/.github/scripts/test_check_labels.py index 15b9d806b302..74ce276c9d10 100644 --- a/.github/scripts/test_check_labels.py +++ b/.github/scripts/test_check_labels.py @@ -38,6 +38,7 @@ def mock_get_comments() -> list[GitHubComment]: body_text="mock_body_text", created_at="", author_login="", + author_url=None, author_association="", editor_login=None, database_id=1, @@ -48,6 +49,7 @@ def mock_get_comments() -> list[GitHubComment]: body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""), created_at="", author_login=BOT_AUTHORS[1], + author_url=None, author_association="", editor_login=None, database_id=2, diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index ac3a1cc12921..790deb85ef8c 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -32,6 +32,7 @@ main as trymerge_main, MandatoryChecksMissingError, MergeRule, + PostCommentError, RE_GHSTACK_DESC, read_merge_rules, remove_job_name_suffix, @@ -588,6 +589,23 @@ def test_get_merge_base(self, *args: Any) -> None: self.assertEqual(mock_merge_base, pr.get_merge_base()) mocked_gh_fetch_merge_base.assert_called_once() + def test_app_can_revert(self, *args: Any) -> None: + pr = GitHubPR("pytorch", "pytorch", 164660) + repo = DummyGitRepo() + app_comment_id, impostor_comment_id = 3375785595, 3377647892 + # Check that the app can revert + self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id)) + # But an impostor cannot + self.assertRaises( + PostCommentError, + lambda: validate_revert(repo, pr, comment_id=impostor_comment_id), + ) + # Despite its name being the name of the bot + self.assertEqual( + pr.get_comment_by_id(impostor_comment_id).author_login, + "pytorch-auto-revert", + ) + @mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) @mock.patch("trymerge.gh_fetch_merge_base", return_value="") diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 00b66869dcf2..c258284a00d8 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -234,6 +234,7 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): createdAt author { login + url } authorAssociation editor { @@ -1091,8 +1092,9 @@ def _comment_from_node(node: Any) -> GitHubComment: editor = node["editor"] return GitHubComment( body_text=node["bodyText"], - created_at=node["createdAt"] if "createdAt" in node else "", + created_at=node.get("createdAt", ""), author_login=node["author"]["login"], +
author_url=node["author"].get("url", None), author_association=node["authorAssociation"], editor_login=editor["login"] if editor else None, database_id=node["databaseId"], @@ -2029,16 +2031,17 @@ def validate_revert( # For some reason, one can not be a member of private repo, only CONTRIBUTOR if pr.is_base_repo_private(): allowed_reverters.append("CONTRIBUTOR") + # Special case the pytorch-auto-revert app, which does not have an association + # but should be able to issue revert commands + if comment.author_url == "https://github.com/apps/pytorch-auto-revert": + allowed_reverters.append("NONE") + if author_association not in allowed_reverters: raise PostCommentError( f"Will not revert as @{author_login} is not one of " f"[{', '.join(allowed_reverters)}], but instead is {author_association}." ) - # Raises exception if matching rule is not found, but ignores all status checks - find_matching_merge_rule( - pr, repo, skip_mandatory_checks=True, skip_internal_checks=True - ) commit_sha = get_pr_commit_sha(repo, pr) return (author_login, commit_sha) diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index fee9ca2eac12..baff04967e3a 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -71,12 +71,15 @@ jobs: with:!{{ upload.binary_env_as_input(config) }} {%- if "aarch64" in build_environment %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" {%- elif "s390x" in build_environment %} runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 + {%- elif config["gpu_arch_type"] == "rocm" %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.24xlarge.ephemeral @@ -174,6 +177,9 @@ jobs: runs-on: linux.rocm.gpu.mi250 timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config) }} + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 7f307447c357..ad5dd74972d0 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -26,9 +26,8 @@ name: !{{ build_environment }} - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 - python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}" + python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}" freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }} {%- endmacro %} diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 2d9e4d0e27b2..476dd182db0f 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -187,8 +187,6 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG uses: pytorch/test-infra/.github/actions/setup-nvidia@main - with: - driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') &&
'580.65.06' || '570.133.07' }} if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index aba3fa3dceec..ebf96264e994 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -67,12 +67,12 @@ jobs: # an OOM issue when running the job, so this upgrades the runner from 4xlarge # to the next available tier of 12xlarge. So much memory just to generate cpp # doc - runner: ${{ inputs.runner_prefix }}linux.12xlarge + runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) # Let's try to figure out how this can be improved timeout-minutes: 360 - docs_type: python - runner: ${{ inputs.runner_prefix }}linux.2xlarge + runner: ${{ inputs.runner_prefix }}linux.c7i.2xlarge # It takes less than 30m to finish python docs unless there are issues timeout-minutes: 30 # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180) diff --git a/.github/workflows/_get-changed-files.yml b/.github/workflows/_get-changed-files.yml index 55712b065270..599d7a3277fe 100644 --- a/.github/workflows/_get-changed-files.yml +++ b/.github/workflows/_get-changed-files.yml @@ -2,6 +2,12 @@ name: Get Changed Files on: workflow_call: + inputs: + all_files: + description: "Whether to return all files instead of just changed files" + required: false + type: boolean + default: false outputs: changed-files: description: "List of changed files (space-separated) or '*' if not in a PR" @@ -26,16 +32,31 @@ jobs: # Get the PR number from the github context PR_NUMBER="${{ github.event.number }}" - # Use gh CLI to get changed files in the PR with explicit repo - CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') + # Check if all_files is requested + if [ "${{ inputs.all_files }}" = "true" ]; then + echo "all_files input is true, returning all files" + echo "changed-files=*" >> "$GITHUB_OUTPUT" + else + # Use gh CLI to get changed files in the PR with explicit repo + CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') - if [ -z "$CHANGED_FILES" ]; then - echo "No changed files found, setting to '*'" - CHANGED_FILES="*" - fi + # See https://github.com/pytorch/pytorch/pull/134215#issuecomment-2332128790 + PYI_FILES_TO_ADD="" + for file in ${CHANGED_FILES}; do + if [[ "${file}" == *".pyi.in" ]]; then + PYI_FILES_TO_ADD="${PYI_FILES_TO_ADD} ${file//.in/}" + fi + done + CHANGED_FILES="${CHANGED_FILES}${PYI_FILES_TO_ADD}" + + if [ -z "$CHANGED_FILES" ]; then + echo "No changed files found, setting to '*'" + CHANGED_FILES="*" + fi - echo "Changed files: $CHANGED_FILES" - echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" + echo "Changed files: $CHANGED_FILES" + echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" + fi else echo "Not in PR context, setting changed files to '*'" diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 6b4bd429e3c9..cc0064391fde 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -37,7 +37,7 @@ on: runner: required: false type: string - default: "linux.2xlarge" + default: "linux.c7i.2xlarge" 
description: | Label of the runner this job should run on. test-matrix: diff --git a/.github/workflows/_linux-test-stable-fa3.yml b/.github/workflows/_linux-test-stable-fa3.yml new file mode 100644 index 000000000000..63a9e7359ed2 --- /dev/null +++ b/.github/workflows/_linux-test-stable-fa3.yml @@ -0,0 +1,255 @@ +# The point of this workflow is to test that a FA3 wheel that was built based off the +# stable ABI as of torch nightly 20250830 can still run on the newer torch. +# +# This workflow is very similar to the _linux-test.yml workflow, with the following +# differences: +# 1. It is simpler (there is no test matrix) +# 2. It pulls flash-attention as a secondary repository in order to access the tests. +# Note that it does not BUILD anything from flash-attention, as we have a prebuilt +# wheel. We pull flash-attention only to run a few tests. +# 3. It runs only FA3 tests. No PyTorch tests are run. +name: linux-test-stable-fa3 + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image: + required: true + type: string + description: Docker image to run in. + timeout-minutes: + required: false + type: number + default: 30 + description: | + Set the maximum (in minutes) how long the workflow should take to finish + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + VLLM_TEST_HUGGING_FACE_TOKEN: + required: false + description: | + HF Auth token to test vllm + SCRIBE_GRAPHQL_ACCESS_TOKEN: + required: false + description: | + FB app token to write to scribe endpoint + +env: + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: linux.aws.h100 + timeout-minutes: ${{ inputs.timeout-minutes || 30 }} + permissions: + id-token: write + contents: read + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + no-sudo: true + + - name: Checkout flash-attention as a secondary repository + uses: actions/checkout@v4 + with: + repository: Dao-AILab/flash-attention + path: flash-attention + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ inputs.docker-image }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Check if in a container runner + shell: bash + id: check_container_runner + run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" + + - name: Setup GPU_FLAG for docker run + id: setup-gpu-flag + run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + + - name: Setup SCCACHE_SERVER_PORT environment for docker run when on 
container + id: setup-sscache-port-flag + run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }} + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Set Test step time + id: test-timeout + shell: bash + env: + JOB_TIMEOUT: ${{ inputs.timeout-minutes }} + run: | + echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: Test + id: test + timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} + SHM_SIZE: '2g' + DOCKER_IMAGE: ${{ inputs.docker-image }} + VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ steps.get-job-id.outputs.job-id }} + run: | + set -x + + TEST_COMMAND=.ci/pytorch/test_fa3_abi_stable.sh + + # Leaving 1GB for the runner and other things + TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo) + # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap + # comes from https://github.com/pytorch/test-infra/pull/6058 + TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) + + + SHM_OPTS="--shm-size=${SHM_SIZE}" + JENKINS_USER="--user jenkins" + DOCKER_SHELL_CMD= + + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e GITHUB_ACTIONS \ + -e GITHUB_REPOSITORY \ + -e GITHUB_WORKFLOW \ + -e GITHUB_JOB \ + -e GITHUB_RUN_ID \ + -e GITHUB_RUN_NUMBER \ + -e GITHUB_RUN_ATTEMPT \ + -e JOB_ID \ + -e JOB_NAME \ + -e BASE_SHA \ + -e BRANCH \ + -e SHA1 \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e HUGGING_FACE_HUB_TOKEN \ + -e VLLM_TEST_HUGGING_FACE_TOKEN \ + -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ + -e ARTIFACTS_FILE_SUFFIX \ + --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ + 
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + ${SHM_OPTS} \ + --tty \ + --detach \ + --name="${container_name}" \ + ${JENKINS_USER} \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + ${DOCKER_SHELL_CMD} + ) + + echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" + + docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" + + - name: Collect backtraces from coredumps (if any) + if: always() + run: | + # shellcheck disable=SC2156 + find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; + + - name: Store Core dumps on S3 + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 + if: failure() + with: + name: coredumps-fa3-stable-abi-smoke-tests + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 537e94488b36..29c2fc8e0847 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -224,6 +224,46 @@ jobs: continue-on-error: true uses: ./.github/actions/download-td-artifacts + - name: Download Windows torch wheel for cross-compilation + if: matrix.win_torch_wheel_artifact != '' + uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0 + with: + name: ${{ matrix.win_torch_wheel_artifact }} + path: win-torch-wheel + + - name: Extract Windows wheel and setup CUDA libraries + if: matrix.win_torch_wheel_artifact != '' + shell: bash + run: | + set -x + + # Find the wheel file + WHEEL_FILE=$(find win-torch-wheel -name "*.whl" -type f | head -n 1) + if [ -z "$WHEEL_FILE" ]; then + echo "Error: No wheel file found in win-torch-wheel directory" + exit 1 + fi + echo "Found wheel file: $WHEEL_FILE" + + # Unzip the wheel file + unzip -q "$WHEEL_FILE" -d win-torch-wheel-extracted + echo "Extracted wheel contents" + + # Setup CUDA libraries (cuda.lib and cudart.lib) directory + mkdir -p win-torch-wheel-extracted/lib/x64 + if [ -f "win-torch-wheel/cuda.lib" ]; then + mv win-torch-wheel/cuda.lib win-torch-wheel-extracted/lib/x64/ + echo "Moved cuda.lib to win-torch-wheel-extracted/lib/x64/" + fi + if [ -f "win-torch-wheel/cudart.lib" ]; then + mv win-torch-wheel/cudart.lib win-torch-wheel-extracted/lib/x64/ + echo "Moved cudart.lib to win-torch-wheel-extracted/lib/x64/" + fi + + # Verify CUDA libraries are present + echo "CUDA libraries:" + ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found" + - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -273,6 +313,8 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ 
matrix.num_shards }} + EXTRA_FLAGS: ${{ matrix.extra_flags || '' }} + OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} @@ -387,8 +429,6 @@ jobs: "${DOCKER_IMAGE}" \ ${DOCKER_SHELL_CMD} ) - # Propagate download.pytorch.org IP to container - grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index a2a5f8dd9111..24fe510f0fb5 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -85,7 +85,7 @@ jobs: uses: pytorch/test-infra/.github/actions/setup-python@main with: python-version: ${{ inputs.python-version }} - pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + pip-requirements-file: .ci/docker/requirements-ci.txt - name: Install sccache (only for non-forked PRs, and pushes to trunk) uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 086e25b4868e..82eb3c4bf2c7 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -122,7 +122,7 @@ jobs: uses: pytorch/test-infra/.github/actions/setup-python@main with: python-version: ${{ inputs.python-version }} - pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + pip-requirements-file: .ci/docker/requirements-ci.txt - name: Start monitoring script id: monitor-script diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 7781e1f65fd1..43ed76a63cc6 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -102,19 +102,6 @@ jobs: exit 1 fi - - name: configure aws credentials - id: aws_creds - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - - name: Login to Amazon ECR - id: login-ecr - continue-on-error: true - uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 - - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 7067d79eb075..0fd3cf7f3972 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -84,9 +84,6 @@ jobs: # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false - - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main - - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main with: @@ -151,7 +148,7 @@ jobs: BUILD_WHEEL: 1 MAX_JOBS: 8 CUDA_VERSION: ${{ inputs.cuda-version }} - PYTHON_VERSION: "3.9" + PYTHON_VERSION: "3.10" SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SCCACHE_REGION: us-east-1 @@ -171,6 +168,31 @@ jobs: run: | .ci/pytorch/win-build.sh + # Collect Windows torch libs and CUDA libs for cross-compilation 
+ - name: Collect Windows CUDA libs for cross-compilation + if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu' + shell: bash + run: | + set -ex + + # Create the directory structure if it does not exist + mkdir -p /c/${{ github.run_id }}/build-results + + # Copy CUDA libs + CUDA_PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${{ inputs.cuda-version }}" + + if [ -f "${CUDA_PATH}/lib/x64/cuda.lib" ]; then + cp "${CUDA_PATH}/lib/x64/cuda.lib" /c/${{ github.run_id }}/build-results/ + fi + + if [ -f "${CUDA_PATH}/lib/x64/cudart.lib" ]; then + cp "${CUDA_PATH}/lib/x64/cudart.lib" /c/${{ github.run_id }}/build-results/ + fi + + # List collected files + echo "Collected CUDA libs:" + ls -lah /c/${{ github.run_id }}/build-results/*.lib + # Upload to github so that people can click and download artifacts - name: Upload artifacts to s3 if: steps.build.outcome != 'skipped' diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 5049ef61f693..3d2fe8a4b3fa 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -77,9 +77,6 @@ jobs: # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false - - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main - - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main with: @@ -106,18 +103,6 @@ jobs: with: cuda-version: ${{ inputs.cuda-version }} - # TODO: Move to a requirements.txt file for windows - - name: Install pip dependencies - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 - with: - shell: bash - timeout_minutes: 5 - max_attempts: 5 - retry_wait_seconds: 30 - command: | - set -eu - python3 -m pip install 'xdoctest>=1.1.0' - - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -184,7 +169,7 @@ jobs: env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: 3.9 + PYTHON_VERSION: "3.10" CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} @@ -272,15 +257,6 @@ jobs: shell: bash run: python3 .github/scripts/parse_ref.py - - name: Uninstall PyTorch - if: always() - continue-on-error: true - shell: bash - run: | - # This step removes PyTorch installed by the test to give a clean slate - # to the next job - python3 -mpip uninstall -y torch - - name: Teardown Windows uses: ./.github/actions/teardown-win if: always() diff --git a/.github/workflows/b200-distributed.yml b/.github/workflows/b200-distributed.yml new file mode 100644 index 000000000000..596a31431e61 --- /dev/null +++ b/.github/workflows/b200-distributed.yml @@ -0,0 +1,62 @@ +name: CI for distributed tests on B200 + +on: + pull_request: + paths: + - .github/workflows/b200-distributed.yml + workflow_dispatch: + push: + tags: + - ciflow/b200-distributed/* + schedule: + - cron: 46 8 * * * # about 1:46am PDT + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses:
pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200: + name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" }, + { config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200: + name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200 + with: + timeout-minutes: 1200 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit diff --git a/.github/workflows/b200-symm-mem.yml b/.github/workflows/b200-symm-mem.yml new file mode 100644 index 000000000000..7fa8a8a73044 --- /dev/null +++ b/.github/workflows/b200-symm-mem.yml @@ -0,0 +1,60 @@ +name: Limited CI for symmetric memory tests on B200 + +on: + pull_request: + paths: + - .github/workflows/b200-symm-mem.yml + workflow_dispatch: + push: + tags: + - ciflow/b200-symm-mem/* + schedule: + - cron: 22 8 * * * # about 1:22am PDT + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm + uses: ./.github/workflows/_linux-test.yml + needs: + - 
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index 0754b154a358..8318286cccbe 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -36,7 +36,7 @@ jobs: runs-on: linux.9xlarge.ephemeral strategy: matrix: - tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"] + tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"] steps: - name: Build docker image uses: pytorch/pytorch/.github/actions/binary-docker-build@main diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index cc2f54fc45f8..c67281e0a112 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -52,8 +52,8 @@ jobs: { tag: "cuda12.9" }, { tag: "cuda12.8" }, { tag: "cuda12.6" }, - { tag: "rocm6.3" }, { tag: "rocm6.4" }, + { tag: "rocm7.0" }, { tag: "cpu" }, ] steps: diff --git a/.github/workflows/build-magma-rocm-linux.yml b/.github/workflows/build-magma-rocm-linux.yml index b6eb09188fd4..eaeb741e5639 100644 --- a/.github/workflows/build-magma-rocm-linux.yml +++ b/.github/workflows/build-magma-rocm-linux.yml @@ -34,7 +34,7 @@ jobs: id-token: write strategy: matrix: - rocm_version: ["64", "63"] + rocm_version: ["70", "64"] steps: - name: Checkout PyTorch uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 9d08501f51bc..a5c5c387adb8 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -46,17 +46,18 @@ jobs: fail-fast: false matrix: include: [ - { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" }, - { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" }, - { name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", 
runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" }, ] runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }} diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 932d9c886302..9e4144ae56c2 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -50,12 +50,12 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] + py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["cuda", "rocm", "xpu", "aarch64"] docker-image: ["pytorch/manylinux2_28-builder:cpu"] include: - device: "rocm" - rocm_version: "6.4" + rocm_version: "7.0" runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" - device: "cuda" rocm_version: "" @@ -108,9 +108,6 @@ jobs: # Determine python executable for given version case $PY_VERS in - 3.9) - PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python - ;; 3.10) PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python ;; @@ -194,7 +191,7 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] + py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["xpu"] timeout-minutes: 40 env: diff --git a/.github/workflows/build-vllm-wheel.yml b/.github/workflows/build-vllm-wheel.yml index 2c6635374841..4526faf6d7fc 100644 --- a/.github/workflows/build-vllm-wheel.yml +++ b/.github/workflows/build-vllm-wheel.yml @@ -27,9 +27,8 @@ jobs: fail-fast: false matrix: python-version: [ '3.12' ] - # TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] - device: [ 'cu128', 'cu129' ] + device: [ 'cu128', 'cu129', 'cu130' ] include: - platform: manylinux_2_28_x86_64 device: cu128 @@ -39,6 +38,10 @@ jobs: device: cu129 manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' runner: linux.12xlarge.memory + - platform: manylinux_2_28_x86_64 + device: cu130 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda13.0' + runner: linux.12xlarge.memory - platform: manylinux_2_28_aarch64 device: cu128 manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8' @@ -47,6 +50,11 @@ jobs: device: cu129 manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9' runner: linux.arm64.r7g.12xlarge.memory + exclude: + # TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and + # xformers is updated to support 13.0 + - platform: manylinux_2_28_aarch64 + device: cu130 name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}" runs-on: ${{ matrix.runner }} timeout-minutes: 480 @@ -169,7 +177,12 @@ jobs: fail-fast: false matrix: platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] - device: [ 'cu128', 'cu129' ] + device: [ 'cu128', 'cu129', 'cu130' ] + exclude: + # TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and + # xformers is updated to support 13.0 + - platform: manylinux_2_28_aarch64 + device: cu130 env: PLATFORM: ${{ matrix.platform }} BUILD_DEVICE: ${{ matrix.device }} diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 57fe7be15d29..d5e0d96fe19f 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -35,6 +35,7 @@ jobs: contents: write outputs: pt_release_name: ${{ steps.release_name.outputs.pt_release_name }} +
pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }} steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -53,8 +54,12 @@ jobs: tag_or_branch="${tag_or_branch#refs/heads/}" # replace directory separators with _ in branch name tag_or_branch="${tag_or_branch//\//_}" - echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV" - echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" + torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')" + { + echo "PT_RELEASE_NAME=pytorch-$tag_or_branch"; + echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz"; + echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz"; + } >> "$GITHUB_ENV" - name: Checkout optional submodules run: python3 tools/optional_submodules.py - name: Copy docs requirements for inclusion @@ -64,30 +69,47 @@ jobs: cp .ci/docker/requirements-docs.txt docs/requirements.txt - name: Create source distribution run: | - # Create new folder with specified name so extracting the archive yields that - rm -rf "/tmp/$PT_RELEASE_NAME" - cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" - mv "/tmp/$PT_RELEASE_NAME" . - # Cleanup - rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci} - find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true - # Create archive - tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" - echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" + # Create new folder with specified name so extracting the archive yields that + rm -rf "/tmp/$PT_RELEASE_NAME" + cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" + mv "/tmp/$PT_RELEASE_NAME" . + # Cleanup + rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci} + find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true + # Create archive + tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" + echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" + - name: Create PEP 517 compatible source distribution + run: | + pip install build==1.2.2.post1 || exit 1 + python -m build --sdist || exit 1 + cd dist || exit 1 - name: Upload source distribution for release if: ${{ github.event_name == 'release' }} uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 with: - files: ${{env.PT_RELEASE_FILE}} - - name: Upload source distribution to GHA artifacts for release tags + files: | + ${{ env.PT_RELEASE_FILE }} + ${{ env.PT_PEP517_RELEASE_FILE }} + - name: Upload source distribution to GHA artifacts # for release tags if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: ${{ env.PT_RELEASE_FILE }} path: ${{ env.PT_RELEASE_FILE }} + - name: Upload PEP 517 source distribution to GHA artifacts # for release tags + if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: ${{ env.PT_PEP517_RELEASE_FILE }} + path: dist/${{ env.PT_PEP517_RELEASE_FILE }} - name: Set output id: release_name - run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}" + run: | + { + echo "pt_release_name=${{ env.PT_RELEASE_FILE }}"; + echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}"; + } >> "${GITHUB_OUTPUT}" upload_source_code_to_s3: if: ${{ github.repository == 'pytorch/pytorch' && 
github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} @@ -103,6 +125,9 @@ jobs: - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 with: name: ${{ needs.release.outputs.pt_release_name }} + - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: ${{ needs.release.outputs.pt_pep517_release_name }} - name: Configure AWS credentials(PyTorch account) uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: @@ -113,7 +138,9 @@ jobs: s3-bucket: pytorch s3-prefix: source_code/test if-no-files-found: warn - path: ${{ needs.release.outputs.pt_release_name }} + path: | + ${{ needs.release.outputs.pt_release_name }} + ${{ needs.release.outputs.pt_pep517_release_name }} concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 492f41775d9d..ca257ee8225a 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -59,7 +59,6 @@ jobs: pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-rocm-n-py3, pytorch-linux-noble-rocm-n-py3, - pytorch-linux-noble-rocm-alpha-py3, pytorch-linux-jammy-rocm-n-py3-benchmarks, pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12, pytorch-linux-jammy-py3.10-gcc11, @@ -70,9 +69,8 @@ jobs: pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, - # Executorch pin needs update - # pytorch-linux-jammy-py3-clang12-executorch, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter, + pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu, pytorch-linux-noble-riscv64-py3.12-gcc14 ] diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 8a3c0840f843..fd31e4819bb9 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -62,7 +62,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -128,7 +128,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -174,7 +174,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -204,6 +204,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_10-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: 
./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_10-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -220,7 +266,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -265,7 +311,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -331,7 +377,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: 
manywheel-py3_11-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -377,7 +423,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -407,6 +453,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -423,7 +515,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: 
linux-aarch64-binary-manywheel @@ -468,7 +560,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -534,7 +626,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -580,7 +672,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -610,6 +702,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + 
DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -626,7 +764,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -671,7 +809,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -737,7 +875,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -783,7 +921,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -813,6 +951,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; 
platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -829,7 +1013,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -874,7 +1058,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -940,7 +1124,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -986,7 +1170,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -1016,6 +1200,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13t-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | 
nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13t-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1032,7 +1262,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -1077,7 +1307,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1143,7 +1373,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -1189,7 +1419,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -1219,6 +1449,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of 
GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1235,7 +1511,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -1280,7 +1556,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1346,7 +1622,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -1392,7 +1668,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -1422,6 +1698,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1438,7 +1760,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 
03835a9f5f35..7f3277ef64a1 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -248,6 +248,74 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + libtorch-cuda12_9-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cuda12_9-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_9-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_9-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_9-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_9-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -316,7 +384,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm6_3-shared-with-deps-release-build: + libtorch-rocm6_4-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -325,22 +393,23 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: 
rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-rocm6_3-shared-with-deps-release + timeout-minutes: 300 + build_name: libtorch-rocm6_4-shared-with-deps-release build_environment: linux-binary-libtorch secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm6_3-shared-with-deps-release-test: # Testing + libtorch-rocm6_4-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-rocm6_3-shared-with-deps-release-build + - libtorch-rocm6_4-shared-with-deps-release-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -349,21 +418,24 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-rocm6_3-shared-with-deps-release + name: libtorch-rocm6_4-shared-with-deps-release path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -394,7 +466,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -407,30 +479,30 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading + libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-rocm6_3-shared-with-deps-release-test + needs: libtorch-rocm6_4-shared-with-deps-release-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-rocm6_3-shared-with-deps-release + build_name: libtorch-rocm6_4-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm6_4-shared-with-deps-release-build: + libtorch-rocm7_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -439,22 +511,23 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" 
GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-rocm6_4-shared-with-deps-release + timeout-minutes: 300 + build_name: libtorch-rocm7_0-shared-with-deps-release build_environment: linux-binary-libtorch secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm6_4-shared-with-deps-release-test: # Testing + libtorch-rocm7_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-rocm6_4-shared-with-deps-release-build + - libtorch-rocm7_0-shared-with-deps-release-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -463,21 +536,24 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-rocm6_4-shared-with-deps-release + name: libtorch-rocm7_0-shared-with-deps-release path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -508,7 +584,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -521,25 +597,25 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading + libtorch-rocm7_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-rocm6_4-shared-with-deps-release-test + needs: libtorch-rocm7_0-shared-with-deps-release-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-rocm6_4-shared-with-deps-release + build_name: libtorch-rocm7_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-release-main.yml b/.github/workflows/generated-linux-binary-libtorch-release-main.yml deleted file mode 100644 index c98d71dfefc4..000000000000 --- a/.github/workflows/generated-linux-binary-libtorch-release-main.yml +++ /dev/null @@ -1,87 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: 
.github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-libtorch-release - - -on: - push: - branches: - - main - tags: - - 'ciflow/trunk/*' - workflow_dispatch: - -permissions: - id-token: write - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-libtorch-release - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 0 -concurrency: - group: linux-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - libtorch-cpu-shared-with-deps-release-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-cpu-shared-with-deps-release - build_environment: linux-binary-libtorch-release - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cpu-shared-with-deps-release-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - libtorch-cpu-shared-with-deps-release-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cpu-shared-with-deps-release - build_environment: linux-binary-libtorch-release - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml deleted file mode 100644 index 96b9f9f739f7..000000000000 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ /dev/null @@ -1,88 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-manywheel - - -on: - push: - branches: - - main - tags: - - 'ciflow/trunk/*' - workflow_dispatch: - -permissions: - 
id-token: write - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-manywheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 0 -concurrency: - group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - manywheel-py3_12-cuda12_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - DESIRED_PYTHON: "3.12" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cuda12_8 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_12-cuda12_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_8 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: 
${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 0f87f97df694..a4a1e3cea95c 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -241,6 +241,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_10-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_10-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: 
./.github/workflows/_binary-upload.yml + manywheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -307,7 +373,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-rocm6_3-build: + manywheel-py3_10-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -316,21 +382,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-rocm6_3-test: # Testing + manywheel-py3_10-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_10-rocm6_3-build + - manywheel-py3_10-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -339,20 +406,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_10-rocm6_3 + name: manywheel-py3_10-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -383,7 +453,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -396,29 +466,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_10-rocm6_3-upload: # Uploading + manywheel-py3_10-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-rocm6_3-test + needs: manywheel-py3_10-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm6_3 + build_name: manywheel-py3_10-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - 
manywheel-py3_10-rocm6_4-build: + manywheel-py3_10-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -427,21 +497,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_10-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-rocm6_4-test: # Testing + manywheel-py3_10-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_10-rocm6_4-build + - manywheel-py3_10-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -450,20 +521,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.10" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_10-rocm6_4 + name: manywheel-py3_10-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -494,7 +568,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -507,24 +581,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_10-rocm6_4-upload: # Uploading + manywheel-py3_10-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-rocm6_4-test + needs: manywheel-py3_10-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm6_4 + build_name: manywheel-py3_10-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -833,6 +907,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch 
+ PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -899,7 +1039,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-rocm6_3-build: + manywheel-py3_11-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -908,21 +1048,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - 
GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_11-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-rocm6_3-test: # Testing + manywheel-py3_11-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_11-rocm6_3-build + - manywheel-py3_11-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -931,20 +1072,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.11" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_11-rocm6_3 + name: manywheel-py3_11-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -975,7 +1119,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -988,29 +1132,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_11-rocm6_3-upload: # Uploading + manywheel-py3_11-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-rocm6_3-test + needs: manywheel-py3_11-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm6_3 + build_name: manywheel-py3_11-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-rocm6_4-build: + manywheel-py3_11-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1019,21 +1163,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: 
manywheel-py3_11-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_11-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-rocm6_4-test: # Testing + manywheel-py3_11-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_11-rocm6_4-build + - manywheel-py3_11-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1042,20 +1187,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.11" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_11-rocm6_4 + name: manywheel-py3_11-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -1086,7 +1234,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -1099,24 +1247,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_11-rocm6_4-upload: # Uploading + manywheel-py3_11-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-rocm6_4-test + needs: manywheel-py3_11-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm6_4 + build_name: manywheel-py3_11-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -1425,6 +1573,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_12-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; 
platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1491,7 +1705,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-rocm6_3-build: + manywheel-py3_12-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1500,21 +1714,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_12-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-rocm6_3-test: # Testing + manywheel-py3_12-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - 
manywheel-py3_12-rocm6_3-build + - manywheel-py3_12-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1523,20 +1738,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.12" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_12-rocm6_3 + name: manywheel-py3_12-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -1567,7 +1785,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -1580,29 +1798,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm6_3-upload: # Uploading + manywheel-py3_12-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-rocm6_3-test + needs: manywheel-py3_12-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm6_3 + build_name: manywheel-py3_12-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-rocm6_4-build: + manywheel-py3_12-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1611,21 +1829,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_12-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-rocm6_4-test: # Testing + manywheel-py3_12-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_12-rocm6_4-build + - manywheel-py3_12-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1634,20 +1853,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - 
DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.12" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_12-rocm6_4 + name: manywheel-py3_12-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -1678,7 +1900,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -1691,24 +1913,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm6_4-upload: # Uploading + manywheel-py3_12-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-rocm6_4-test + needs: manywheel-py3_12-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm6_4 + build_name: manywheel-py3_12-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2017,6 +2239,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | 
nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2083,7 +2371,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-rocm6_3-build: + manywheel-py3_13-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2092,21 +2380,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_13-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-rocm6_3-test: # Testing + manywheel-py3_13-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13-rocm6_3-build + - manywheel-py3_13-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -2115,20 +2404,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: 
./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_13-rocm6_3 + name: manywheel-py3_13-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -2159,7 +2451,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -2172,29 +2464,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_13-rocm6_3-upload: # Uploading + manywheel-py3_13-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-rocm6_3-test + needs: manywheel-py3_13-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-rocm6_3 + build_name: manywheel-py3_13-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-rocm6_4-build: + manywheel-py3_13-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2203,21 +2495,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_13-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-rocm6_4-test: # Testing + manywheel-py3_13-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13-rocm6_4-build + - manywheel-py3_13-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -2226,20 +2519,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_13-rocm6_4 + name: manywheel-py3_13-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -2270,7 +2566,7 @@ jobs: with: 
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -2283,24 +2579,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_13-rocm6_4-upload: # Uploading + manywheel-py3_13-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-rocm6_4-test + needs: manywheel-py3_13-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-rocm6_4 + build_name: manywheel-py3_13-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2515,35 +2811,101 @@ jobs: DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_6 + build_name: manywheel-py3_13t-cuda12_6 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda12_6-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-cuda12_8 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 
'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-cuda12_8-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_6-upload: # Uploading + manywheel-py3_13t-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda12_6-test + needs: manywheel-py3_13t-cuda12_8-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: "12.6" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_6 + build_name: manywheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda12_8-build: + manywheel-py3_13t-cuda12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2552,22 +2914,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cuda12_8 + build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | 
nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_8-test: # Testing + manywheel-py3_13t-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13t-cuda12_8-build + - manywheel-py3_13t-cuda12_9-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -2575,36 +2937,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_8 + build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_8-upload: # Uploading + manywheel-py3_13t-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda12_8-test + needs: manywheel-py3_13t-cuda12_9-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_8 + build_name: manywheel-py3_13t-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2675,7 +3037,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-rocm6_3-build: + manywheel-py3_13t-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' 
}} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2684,21 +3046,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_13t-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-rocm6_3-test: # Testing + manywheel-py3_13t-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13t-rocm6_3-build + - manywheel-py3_13t-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -2707,20 +3070,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13t" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_13t-rocm6_3 + name: manywheel-py3_13t-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -2751,7 +3117,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -2764,29 +3130,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_13t-rocm6_3-upload: # Uploading + manywheel-py3_13t-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-rocm6_3-test + needs: manywheel-py3_13t-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-rocm6_3 + build_name: manywheel-py3_13t-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-rocm6_4-build: + manywheel-py3_13t-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2795,21 +3161,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 
"6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_13t-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-rocm6_4-test: # Testing + manywheel-py3_13t-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13t-rocm6_4-build + - manywheel-py3_13t-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -2818,20 +3185,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13t" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_13t-rocm6_4 + name: manywheel-py3_13t-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -2862,7 +3232,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -2875,24 +3245,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_13t-rocm6_4-upload: # Uploading + manywheel-py3_13t-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-rocm6_4-test + needs: manywheel-py3_13t-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-rocm6_4 + build_name: manywheel-py3_13t-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3201,6 +3571,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_14-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3267,7 +3703,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-rocm6_3-build: + manywheel-py3_14-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3276,21 +3712,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14-rocm6_3 
+ timeout-minutes: 300 + build_name: manywheel-py3_14-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-rocm6_3-test: # Testing + manywheel-py3_14-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14-rocm6_3-build + - manywheel-py3_14-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -3299,20 +3736,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_14-rocm6_3 + name: manywheel-py3_14-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -3343,7 +3783,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -3356,29 +3796,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_14-rocm6_3-upload: # Uploading + manywheel-py3_14-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14-rocm6_3-test + needs: manywheel-py3_14-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-rocm6_3 + build_name: manywheel-py3_14-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-rocm6_4-build: + manywheel-py3_14-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3387,21 +3827,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_14-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-rocm6_4-test: # Testing + manywheel-py3_14-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - 
- manywheel-py3_14-rocm6_4-build + - manywheel-py3_14-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -3410,20 +3851,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_14-rocm6_4 + name: manywheel-py3_14-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -3454,7 +3898,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -3467,24 +3911,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_14-rocm6_4-upload: # Uploading + manywheel-py3_14-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14-rocm6_4-test + needs: manywheel-py3_14-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-rocm6_4 + build_name: manywheel-py3_14-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3793,6 +4237,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_14t-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | 
nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14t-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3859,7 +4369,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-rocm6_3-build: + manywheel-py3_14t-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3868,21 +4378,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14t-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_14t-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-rocm6_3-test: # Testing + manywheel-py3_14t-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14t-rocm6_3-build + - manywheel-py3_14t-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -3891,20 +4402,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 
"6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14t" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_14t-rocm6_3 + name: manywheel-py3_14t-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -3935,7 +4449,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -3948,29 +4462,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_14t-rocm6_3-upload: # Uploading + manywheel-py3_14t-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14t-rocm6_3-test + needs: manywheel-py3_14t-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-rocm6_3 + build_name: manywheel-py3_14t-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-rocm6_4-build: + manywheel-py3_14t-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3979,21 +4493,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14t-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_14t-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-rocm6_4-test: # Testing + manywheel-py3_14t-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14t-rocm6_4-build + - manywheel-py3_14t-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -4002,20 +4517,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14t" + permissions: + id-token: write + contents: read 
steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_14t-rocm6_4 + name: manywheel-py3_14t-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -4046,7 +4564,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -4059,24 +4577,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_14t-rocm6_4-upload: # Uploading + manywheel-py3_14t-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14t-rocm6_4-test + needs: manywheel-py3_14t-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-rocm6_4 + build_name: manywheel-py3_14t-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml deleted file mode 100644 index 8177bac3fe21..000000000000 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ /dev/null @@ -1,135 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-manywheel-rocm - - -on: - push: - branches: - - main - tags: - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - - 'ciflow/rocm/*' - workflow_dispatch: - -permissions: - id-token: write - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-manywheel-rocm - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 0 -concurrency: - group: linux-binary-manywheel-rocm-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-rocm6_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: 
./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_4 - build_environment: linux-binary-manywheel-rocm - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-rocm6_4-build - - get-label-type - runs-on: linux.rocm.gpu.mi250 - timeout-minutes: 240 - env: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" - GPU_ARCH_TYPE: rocm - SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" - steps: - - name: Setup ROCm - uses: ./.github/actions/setup-rocm - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm6_4 - path: "${{ runner.temp }}/artifacts/" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: ROCm set GPU_FLAG - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - - name: configure aws credentials - id: aws_creds - if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} - docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 - docker-build-dir: .ci/docker - working-directory: pytorch - - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Test Pytorch binary - uses: ./pytorch/.github/actions/test-pytorch-binary - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Teardown ROCm - uses: ./.github/actions/teardown-rocm diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index cd912650eb17..109e98cd9d91 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -63,7 +63,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 
version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.10.4" freethreaded: false diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index 8522d2d36993..afe9330deb83 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -59,7 +59,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.10.4" freethreaded: false @@ -169,7 +168,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.11.4" freethreaded: false @@ -279,7 +277,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.12.4" freethreaded: false @@ -389,7 +386,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.13.4" freethreaded: false @@ -499,7 +495,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.13.4" freethreaded: true @@ -609,9 +604,8 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 - python-version: "3.14.0-rc.2" + python-version: "3.14.0" freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 @@ -719,9 +713,8 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 - python-version: "3.14.0-rc.2" + python-version: "3.14.0" freethreaded: true - name: Checkout PyTorch uses: actions/checkout@v4 diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml deleted file mode 100644 index 818d2ca45cc4..000000000000 --- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml +++ /dev/null @@ -1,261 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-libtorch-debug - -on: - push: - branches: - - main - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: windows-binary-libtorch-debug - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 - OS: windows -concurrency: - group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name 
== 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - libtorch-cpu-shared-with-deps-debug-build: - if: ${{ github.repository_owner == 'pytorch' }} - needs: get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 360 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.10" - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main - continue-on-error: true - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon - shell: bash - run: | - git config --global core.longpaths true - git config --global core.symlinks true - - # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock - # the directory on Windows and prevent GHA from checking out as reported - # in https://github.com/actions/checkout/issues/1018 - git config --global core.fsmonitor false - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be - # removed once Windows Defender is removed from the AMI - - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch - continue-on-error: true - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore - # Let's both exclude the path and disable Windows Defender completely just to be sure - # that it doesn't interfere - Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v4.4.0 - if: always() - with: - name: libtorch-cpu-shared-with-deps-debug - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - libtorch-cpu-shared-with-deps-debug-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - libtorch-cpu-shared-with-deps-debug-build - - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 360 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main - continue-on-error: true - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon - shell: bash - run: | - git config --global core.longpaths true - git config --global core.symlinks 
true - - # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock - # the directory on Windows and prevent GHA from checking out as reported - # in https://github.com/actions/checkout/issues/1018 - git config --global core.fsmonitor false - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails. This step can be - # removed once Windows Defender is removed from the AMI - - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch - continue-on-error: true - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore - # Let's both exclude the path and disable Windows Defender completely just to be sure - # that it doesn't interfere - Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-debug - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml deleted file mode 100644 index ff8a2bbbfe1e..000000000000 --- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml +++ /dev/null @@ -1,261 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-libtorch-release - -on: - push: - branches: - - main - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: 
windows-binary-libtorch-release - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 - OS: windows -concurrency: - group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - libtorch-cpu-shared-with-deps-release-build: - if: ${{ github.repository_owner == 'pytorch' }} - needs: get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 360 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.10" - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main - continue-on-error: true - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon - shell: bash - run: | - git config --global core.longpaths true - git config --global core.symlinks true - - # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock - # the directory on Windows and prevent GHA from checking out as reported - # in https://github.com/actions/checkout/issues/1018 - git config --global core.fsmonitor false - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails. This step can be - # removed once Windows Defender is removed from the AMI - - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch - continue-on-error: true - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore - # Let's both exclude the path and disable Windows Defender completely just to be sure - # that it doesn't interfere - Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v4.4.0 - if: always() - with: - name: libtorch-cpu-shared-with-deps-release - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - libtorch-cpu-shared-with-deps-release-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - libtorch-cpu-shared-with-deps-release-build - - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 360 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" - 
} - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main - continue-on-error: true - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon - shell: bash - run: | - git config --global core.longpaths true - git config --global core.symlinks true - - # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock - # the directory on Windows and prevent GHA from checking out as reported - # in https://github.com/actions/checkout/issues/1018 - git config --global core.fsmonitor false - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails. This step can be - # removed once Windows Defender is removed from the AMI - - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch - continue-on-error: true - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore - # Let's both exclude the path and disable Windows Defender completely just to be sure - # that it doesn't interfere - Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-release - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index a0a7495483d4..be19b8f961f4 100644 --- a/.github/workflows/h100-distributed.yml +++ b/.github/workflows/h100-distributed.yml @@ -37,7 +37,7 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: "linux.12xlarge" + runner: "linux.c7i.12xlarge" build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index 41210f89c9a8..8209bf053a77 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100 on: schedule: - - cron: 15 0,12 * * 1-6 + - cron: 15 0 * * 1-6 - cron: 0 7 * * 0 # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs @@ -130,7 +130,7 @@ jobs: name: test-periodically uses: ./.github/workflows/_linux-test.yml needs: build - if: github.event.schedule == '15 0,12 * * 1-6' + if: github.event.schedule == '15 0 * * 1-6' with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index c3b9a4229924..81c1c27b7643 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml +++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -63,6 +63,7 @@ jobs: # Same as the build job python-version: 3.12.7 test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }} + timeout-minutes: 300 disable-monitor: false monitor-log-interval: 15 monitor-data-collect-interval: 4 diff --git a/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml b/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml new file mode 100644 index 000000000000..8d6da1850300 --- /dev/null +++ b/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml @@ -0,0 +1,132 @@ +name: inductor-perf-nightly-rocm-mi300 + +on: + push: + tags: + - ciflow/inductor-perf-test-nightly-rocm-mi300/* + schedule: + - cron: 
15 0 * * *
+  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
+  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
+  workflow_dispatch:
+    inputs:
+      training:
+        description: Run training (on by default)?
+        required: false
+        type: boolean
+        default: true
+      inference:
+        description: Run inference (on by default)?
+        required: false
+        type: boolean
+        default: true
+      default:
+        description: Run inductor_default?
+        required: false
+        type: boolean
+        default: false
+      dynamic:
+        description: Run inductor_dynamic_shapes?
+        required: false
+        type: boolean
+        default: false
+      cppwrapper:
+        description: Run inductor_cpp_wrapper?
+        required: false
+        type: boolean
+        default: false
+      cudagraphs:
+        description: Run inductor_cudagraphs?
+        required: false
+        type: boolean
+        default: true
+      freezing_cudagraphs:
+        description: Run inductor_cudagraphs with freezing for inference?
+        required: false
+        type: boolean
+        default: false
+      aotinductor:
+        description: Run aot_inductor for inference?
+        required: false
+        type: boolean
+        default: false
+      maxautotune:
+        description: Run inductor_max_autotune?
+        required: false
+        type: boolean
+        default: false
+      benchmark_configs:
+        description: The list of configs used the benchmark
+        required: false
+        type: string
+        default: inductor_huggingface_perf_rocm_mi300,inductor_timm_perf_rocm_mi300,inductor_torchbench_perf_rocm_mi300
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+      opt_out_experiments: lf
+
+  linux-jammy-rocm-py3_10-inductor-benchmark-build:
+    if: github.repository_owner == 'pytorch'
+    name: rocm-py3_10-inductor-benchmark-build
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-jammy-rocm-py3_10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 4, num_shards: 7, runner:
"linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm_mi300", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm_mi300", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm_mi300", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-inductor-benchmark-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-inductor-benchmark-test + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-benchmark-build + with: + build-environment: linux-jammy-rocm-py3_10 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }} + timeout-minutes: 720 + # Disable monitor in perf tests for more investigation + disable-monitor: true + monitor-log-interval: 10 + monitor-data-collect-interval: 2 + secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml similarity index 58% rename from .github/workflows/inductor-perf-test-nightly-rocm.yml rename to .github/workflows/inductor-perf-test-nightly-rocm-mi355.yml index f329fe74e6b6..24872d2b1f11 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml @@ -1,11 +1,11 @@ -name: inductor-perf-nightly-rocm +name: inductor-perf-nightly-rocm-mi355 on: push: tags: - - ciflow/inductor-perf-test-nightly-rocm/* + - ciflow/inductor-perf-test-nightly-rocm-mi355/* schedule: - - cron: 0 7 * * 0,3 + - cron: 15 0 * * * # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs workflow_dispatch: @@ -59,7 +59,7 @@ on: description: The list of configs used the benchmark required: false type: string - default: inductor_huggingface_perf_rocm,inductor_timm_perf_rocm,inductor_torchbench_perf_rocm + default: inductor_huggingface_perf_rocm_mi355,inductor_timm_perf_rocm_mi355,inductor_torchbench_perf_rocm_mi355 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ 
github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} @@ -88,23 +88,27 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks test-matrix: | { include: [ - { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 2, num_shards: 9, runner: 
"linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index 454cd166c90b..4b0e573d129c 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -106,6 +106,16 @@ jobs: { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b1a6dfb39071..729b11157485 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,6 +12,7 @@ on: - landchecks/* tags: - ciflow/pull/* + - ciflow/trunk/* workflow_dispatch: permissions: read-all @@ -31,9 +32,13 @@ jobs: if: github.repository_owner == 'pytorch' name: Get changed files uses: ./.github/workflows/_get-changed-files.yml + with: + all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') || github.event_name == 'push' }} lintrunner-clang: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + # Needed to prevent deduping on HUD + name: lintrunner-clang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} needs: [get-label-type, get-changed-files] # Only run if there are changed files relevant to clangtidy / clangformat if: | @@ -53,7 +58,7 @@ jobs: with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - docker-image: 
ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 @@ -73,6 +78,7 @@ jobs: # fails to find types when it should lintrunner-mypy: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + name: lintrunner-mypy-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} needs: [get-label-type, get-changed-files] # Only run if there are changed files relevant to mypy if: | @@ -97,6 +103,7 @@ jobs: lintrunner-noclang: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + name: lintrunner-noclang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} needs: [get-label-type, get-changed-files] with: timeout: 120 @@ -111,9 +118,9 @@ jobs: CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" echo "Running all other linters" if [ "$CHANGED_FILES" = '*' ]; then - ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY --all-files" .github/scripts/lintrunner.sh else - ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY ${CHANGED_FILES}" .github/scripts/lintrunner.sh fi quick-checks: @@ -264,10 +271,10 @@ jobs: with: submodules: false fetch-depth: 1 - - name: Setup Python 3.9 + - name: Setup Python 3.10 uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.9' + python-version: '3.10' architecture: x64 cache: pip - name: Install dependencies diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index dcdc2cd0ba24..40fb3b8d0c85 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -7,9 +7,11 @@ on: workflow_dispatch: inputs: test_mode: - required: false - type: string - default: 'short' + type: choice + options: + - 'short' + - 'long' + - 'all' description: tag filter for operator benchmarks, options from long, short, all schedule: # Run at 07:00 UTC every Sunday @@ -28,38 +30,49 @@ permissions: contents: read jobs: - opbenchmark-build: + x86-opbenchmark-build: if: github.repository_owner == 'pytorch' - name: opbenchmark-build + name: x86-opbenchmark-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ - { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + { config: "cpu_operator_benchmark_${{ inputs.test_mode || 'short' }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - opbenchmark-on-demand-build: - if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} - name: opbenchmark-on-demand-build - uses: ./.github/workflows/_linux-build.yml + x86-opbenchmark-test: + name: x86-opbenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: x86-opbenchmark-build with: build-environment: linux-jammy-py3.10-gcc11-build - docker-image-name: 
ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks + docker-image: ${{ needs.x86-opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.x86-opbenchmark-build.outputs.test-matrix }} + secrets: inherit + + aarch64-opbenchmark-build: + if: github.repository_owner == 'pytorch' + name: aarch64-opbenchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-aarch64-py3.10 + runner: linux.arm64.m7g.4xlarge + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 test-matrix: | { include: [ - { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" }, ]} secrets: inherit - opbenchmark-test: - name: opbenchmark-test + aarch64-opbenchmark-test: + name: aarch64-opbenchmark-test uses: ./.github/workflows/_linux-test.yml - needs: opbenchmark-build + needs: aarch64-opbenchmark-build with: - build-environment: linux-jammy-py3.10-gcc11-build - docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }} - test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }} + build-environment: linux-jammy-aarch64-py3.10 + docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/operator_microbenchmark.yml b/.github/workflows/operator_microbenchmark.yml new file mode 100644 index 000000000000..89d6d63c7287 --- /dev/null +++ b/.github/workflows/operator_microbenchmark.yml @@ -0,0 +1,100 @@ +name: operator_microbenchmark + +on: + push: + tags: + - ciflow/op-benchmark/* + workflow_dispatch: + schedule: + # Run at 06:00 UTC everyday + - cron: 0 6 * * * + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + # H100 A100 runners + opmicrobenchmark-build: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.0 9.0' + test-matrix: | + { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, + ]} + secrets: inherit + + opmicrobenchmark-test: + name: opmicrobenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: opmicrobenchmark-build + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }} + secrets: inherit + + # B200 runner + opmicrobenchmark-build-b200: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build-b200 + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | 
+ { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, + ]} + secrets: inherit + + opmicrobenchmark-test-b200: + name: opmicrobenchmark-test-b200 + uses: ./.github/workflows/_linux-test.yml + needs: opmicrobenchmark-build-b200 + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit + + # ROCM MI300 runner + opmicrobenchmark-build-rocm: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build-rocm + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks + test-matrix: | + { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + ]} + secrets: inherit + + opmicrobenchmark-test-rocm: + name: opmicrobenchmark-test-rocm + uses: ./.github/workflows/_rocm-test.yml + needs: opmicrobenchmark-build-rocm + with: + timeout-minutes: 500 + build-environment: linux-jammy-rocm-py3_10 + docker-image: ${{ needs.opmicrobenchmark-build-rocm.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build-rocm.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 714838eb8476..d821878074b2 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -59,13 +59,14 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-cuda12.4-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11 + cuda-arch-list: 7.5 test-matrix: | { include: [ - { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, ]} secrets: inherit @@ -112,13 +113,13 @@ jobs: test-matrix: ${{ 
needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_9-gcc9-build: - name: linux-jammy-cuda12.8-py3.9-gcc9 + linux-jammy-cuda12_8-py3_10-gcc9-build: + name: linux-jammy-cuda12.8-py3.10-gcc9 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.9-gcc9 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 cuda-arch-list: 8.6 test-matrix: | @@ -128,14 +129,14 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_9-gcc9-test: - name: linux-jammy-cuda12.8-py3.9-gcc9 + linux-jammy-cuda12_8-py3_10-gcc9-test: + name: linux-jammy-cuda12.8-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_9-gcc9-build + needs: linux-jammy-cuda12_8-py3_10-gcc9-build with: - build-environment: linux-jammy-cuda12.8-py3.9-gcc9 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.test-matrix }} + build-environment: linux-jammy-cuda12.8-py3.10-gcc9 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }} secrets: inherit linux-jammy-cuda12_8-py3_10-gcc9-debug-build: @@ -181,11 +182,11 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11 test-matrix: | { include: [ - { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, ]} secrets: inherit @@ -212,9 +213,9 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ - { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, - { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, - { config: "distributed", shard: 3, num_shards: 3, runner: 
"linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, ]} secrets: inherit diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ff6e9ed10711..a31a10063f1b 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -127,7 +127,6 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - # More memory is needed to build with asan runner: linux.2xlarge.memory runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan @@ -318,32 +317,6 @@ jobs: ]} secrets: inherit - linux-jammy-py3-clang12-executorch-build: - if: false # Docker build needs pin update - name: linux-jammy-py3-clang12-executorch - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3-clang12-executorch - docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch - test-matrix: | - { include: [ - { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, - ]} - secrets: inherit - - linux-jammy-py3-clang12-executorch-test: - name: linux-jammy-py3-clang12-executorch - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3-clang12-executorch-build - if: false # Has been broken for a while - with: - build-environment: linux-jammy-py3-clang12-executorch - docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: name: cuda12.8-py3.10-gcc9-sm75 uses: ./.github/workflows/_linux-build.yml @@ -369,14 +342,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-xpu-n-py3_9-build: - name: linux-jammy-xpu-n-py3.9 + linux-jammy-xpu-n-py3_10-build: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-n-py3.9 + build-environment: linux-jammy-xpu-n-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 test-matrix: | { include: [ diff --git a/.github/workflows/quantization-periodic.yml b/.github/workflows/quantization-periodic.yml new file mode 100644 index 000000000000..688f557eaf0e --- /dev/null +++ b/.github/workflows/quantization-periodic.yml @@ -0,0 +1,54 @@ +name: quantization-periodic + +on: + push: + tags: + - ciflow/quantization-periodic/* + workflow_dispatch: + schedule: + # run weekly + - cron: "45 0 * * 0" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + get-default-label-prefix: + name: get-default-label-prefix 
+ uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + periodic-quantization-build: + name: periodic-quantization-build + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.9' + test-matrix: | + { include: [ + { config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + ]} + secrets: inherit + periodic-test-quantization: + name: periodic-test-quantization + uses: ./.github/workflows/_linux-test.yml + needs: periodic-quantization-build + with: + build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 + docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/rocm-mi355.yml b/.github/workflows/rocm-mi355.yml index e5dda604a4db..6d05ae9ae3ec 100644 --- a/.github/workflows/rocm-mi355.yml +++ b/.github/workflows/rocm-mi355.yml @@ -1,6 +1,9 @@ name: rocm-mi355 on: + push: + tags: + - ciflow/rocm-mi355/* workflow_dispatch: schedule: - cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT @@ -38,16 +41,16 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-noble-rocm-py3.12-mi355 - docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3 + docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3 sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, ]} secrets: inherit @@ -64,5 +67,7 @@ jobs: build-environment: linux-noble-rocm-py3.12-mi355 docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }} - tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" + 
tests-to-include: >- + ${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda' + || '' }} secrets: inherit diff --git a/.github/workflows/rocm-navi31.yml b/.github/workflows/rocm-navi31.yml new file mode 100644 index 000000000000..aaee8fce262b --- /dev/null +++ b/.github/workflows/rocm-navi31.yml @@ -0,0 +1,63 @@ +name: rocm-navi31 + +on: + push: + tags: + - ciflow/rocm-navi31/* + workflow_dispatch: + schedule: + # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs. + # Also run less frequently on weekends. + - cron: 45 */2 * * 1-5 + - cron: 45 4,12 * * 0,6 + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + target-determination: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/target_determination.yml + permissions: + id-token: write + contents: read + + linux-jammy-rocm-py3_10-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3_10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + tests-to-include: >- + ${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs + test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark + inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor + inductor/test_torchinductor inductor/test_decompose_mem_bound_mm + inductor/test_flex_attention inductor/test_max_autotune' || '' }} + secrets: inherit diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index c21c851aab6d..227c7f676b1c 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -36,12 +36,12 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + 
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, ]} secrets: inherit diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 9675ee4169f4..d4992a2ddb2c 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -140,7 +140,6 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - # More memory is needed to build with asan runner: linux.2xlarge.memory runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan diff --git a/.github/workflows/test-b200.yml b/.github/workflows/test-b200.yml new file mode 100644 index 000000000000..ef7f75bc4b2b --- /dev/null +++ b/.github/workflows/test-b200.yml @@ -0,0 +1,76 @@ +# B200 Smoke Tests CI Workflow +# +# This workflow runs smoke tests on B200 hardware +# +# Flow: +# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200 +# 2. Runs smoke tests on linux.dgx.b200 runner +# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function +# +# Triggered by: +# - Pull requests modifying this workflow file +# - Manual dispatch +# - Schedule (every 6 hours) +# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag) + +name: B200 Smoke Tests + +on: + pull_request: + paths: + - .github/workflows/test-b200.yml + workflow_dispatch: + schedule: + - cron: 0 4,10,16,22 * * * # every 6 hours + push: + tags: + - ciflow/b200/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, + ]} + # config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }} + test-matrix: ${{ 
needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index 1e83c7b9d98c..ec99f4473bb0 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -61,3 +61,15 @@ jobs: docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.test-matrix }} secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test: + name: linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test + uses: ./.github/workflows/_linux-test-stable-fa3.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }} + timeout-minutes: 30 + s3-bucket: gha-artifacts + secrets: inherit diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 5b1a12812003..48d1c4490d72 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -56,7 +56,7 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 build-generates-artifacts: false runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: "linux.4xlarge" + runner: "linux.c7i.4xlarge" test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -160,9 +160,10 @@ jobs: runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, - { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, - { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, ]} secrets: inherit @@ -179,13 +180,13 @@ jobs: disable-monitor: false secrets: inherit - win-vs2022-cuda12_6-py3-build: - name: win-vs2022-cuda12.6-py3 + win-vs2022-cuda12_8-py3-build: + name: win-vs2022-cuda12.8-py3 uses: ./.github/workflows/_win-build.yml needs: get-label-type with: - build-environment: win-vs2022-cuda12.6-py3 - cuda-version: "12.6" + build-environment: win-vs2022-cuda12.8-py3 + cuda-version: "12.8" runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" secrets: inherit @@ -203,7 +204,6 @@ jobs: { include: [ { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" }, ]} secrets: inherit @@ -221,7 
+221,7 @@ jobs: build-environment: linux-jammy-rocm-py3.10 docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} - tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" + tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" secrets: inherit inductor-build: @@ -234,6 +234,23 @@ jobs: cuda-arch-list: '8.0' secrets: inherit + # Test cross-compiled models with Windows libs extracted from wheel + cross-compile-linux-test: + name: cross-compile-linux-test + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - get-label-type + - win-vs2022-cuda12_8-py3-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" }, + ]} + secrets: inherit + verify-cachebench-cpu-build: name: verify-cachebench-cpu-build uses: ./.github/workflows/_linux-build.yml @@ -259,3 +276,38 @@ jobs: docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} secrets: inherit + + linux-jammy-py3-clang12-executorch-build: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3-clang12-executorch + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch + test-matrix: | + { include: [ + { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3-clang12-executorch-test: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3-clang12-executorch-build + with: + build-environment: linux-jammy-py3-clang12-executorch + docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_10-gcc11-full-debug-build-only: + name: linux-jammy-py3.10-gcc11-full-debug-build-only + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.2xlarge.memory + build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 + secrets: inherit diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 7f0fe6058bd0..b5955127d9fb 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -53,27 +53,3 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - - linux-jammy-py3_9-clang9-xla-build: - name: linux-jammy-py3_9-clang9-xla 
- uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang9-xla - docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite - test-matrix: | - { include: [ - { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, - ]} - secrets: inherit - - linux-jammy-py3_9-clang9-xla-test: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_9-clang9-xla-build - with: - build-environment: linux-jammy-py3.9-clang9-xla - docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} - secrets: inherit diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index aa12cf22b246..b3fc9efdf667 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -23,7 +23,7 @@ jobs: with: repository: pytorch/pytorch stable-branch: viable/strict - requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]' + requires: '[\"pull\", \"trunk\", \"lint\", \"linux-aarch64\"]' secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} clickhouse-url: ${{ secrets.CLICKHOUSE_URL }} clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }} @@ -48,4 +48,7 @@ jobs: echo "{\"sha\": \"${LATEST_SHA}\", \"repository\":\"pytorch/pytorch\", \"timestamp\": ${TIME}}" > "/tmp/${LATEST_SHA}.json" pip install awscli==1.29.40 aws s3 cp "/tmp/${LATEST_SHA}.json" "s3://ossci-raw-job-status/stable_pushes/pytorch/pytorch/${LATEST_SHA}.json" + # Push new viable/strict tag + cd pytorch/pytorch + git push origin "${LATEST_SHA}:refs/tags/viable/strict/${TIME}" fi diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index b2768a8f767e..3bddecdadfe3 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -42,11 +42,11 @@ jobs: build-external-packages: "vllm" build-environment: linux-jammy-cuda12.8-py3.12-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm - cuda-arch-list: '8.0;8.9;9.0' + cuda-arch-list: '8.0 8.9 9.0' runner: linux.24xlarge.memory test-matrix: | { include: [ - { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, @@ -54,7 +54,7 @@ jobs: { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_multi_model_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, - { config: "vllm_languagde_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: 
"linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_language_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, { config: "vllm_distributed_test_2_gpu_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 36ba62349f28..c6bdb06812e7 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -35,7 +35,7 @@ jobs: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-n-1-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3 - runner: linux.12xlarge + runner: linux.c7i.12xlarge test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, @@ -56,7 +56,7 @@ jobs: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-n-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 - runner: linux.12xlarge + runner: linux.c7i.12xlarge test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" }, diff --git a/.gitignore b/.gitignore index 2dd40f8cfa85..447ef777e929 100644 --- a/.gitignore +++ b/.gitignore @@ -82,12 +82,13 @@ torch/return_types.pyi torch/nn/functional.pyi torch/utils/data/datapipes/datapipe.pyi torch/csrc/autograd/generated/* +torch/csrc/functionalization/generated/* torch/csrc/lazy/generated/*.[!m]* torch_compile_debug/ # Listed manually because some files in this directory are not generated torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt -torch/csrc/api/include/torch/version.h +torch/headeronly/version.h torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp @@ -259,6 +260,9 @@ gen .pytest_cache aten/build/* +# Linker scripts for prioritized text optimization +cmake/linker_script.ld + # Bram plsdontbreak @@ -370,6 +374,7 @@ third_party/ruy/ third_party/glog/ # Virtualenv +.venv/ venv/ # Log files @@ -391,3 +396,4 @@ android/pytorch_android_torchvision/.cxx CLAUDE.local.md /test_*.py /debug_*.py +CLAUDE_CONTEXT/ diff --git a/.lintrunner.toml b/.lintrunner.toml index 1f79f1eb971d..411e4d2c215b 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -18,6 +18,7 @@ exclude_patterns = [ 'torch/_inductor/autoheuristic/artifacts/**', 'scripts/**', 'test/generated_type_hints_smoketest.py', + 'test/test_torchfuzz_repros.py', # CPython tests 'test/dynamo/cpython/**', # Tests from the NumPy test suite @@ -27,6 +28,7 @@ exclude_patterns = [ 'torch/lib/**', 'venv/**', '**/*.pyi', + "tools/experimental/torchfuzz/**", 'tools/test/test_selective_build.py', ] command = [ @@ -49,7 +51,7 @@ init_command = [ 'mccabe==0.7.0', 'pycodestyle==2.14.0', 'pyflakes==3.4.0', - 'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"', + 'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"', ] @@ -123,6 +125,7 @@ is_formatter = true code = 'MYPY' include_patterns = [ 'setup.py', + 'functorch/dim/**/*.py', 'torch/**/*.py', 'torch/**/*.pyi', 'caffe2/**/*.py', @@ -152,7 +155,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'numpy==1.26.4 ; python_version >= "3.9" 
and python_version <= "3.11"', + 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', 'numpy==2.1.0 ; python_version >= "3.12"', 'expecttest==0.3.0', 'mypy==1.16.0', @@ -195,6 +198,7 @@ exclude_patterns = [ 'tools/test/gen_operators_yaml_test.py', 'tools/test/gen_oplist_test.py', 'tools/test/test_selective_build.py', + 'tools/experimental/torchfuzz/**', ] command = [ 'python3', @@ -205,6 +209,46 @@ command = [ '@{{PATHSFILE}}' ] + +[[linter]] +code = 'PYREFLY' +include_patterns = [ + 'torch/**/*.py', + 'torch/**/*.pyi', + 'torchgen/**/*.py', + 'torchgen/**/*.pyi', + 'functorch/**/*.py', + 'functorch/**/*.pyi', +] +exclude_patterns = [] +command = [ + 'python3', + 'tools/linter/adapters/pyrefly_linter.py', + '--config=pyrefly.toml', +] +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'numpy==2.1.0 ; python_version >= "3.12"', + 'expecttest==0.3.0', + 'pyrefly==0.36.2', + 'sympy==1.13.3', + 'types-requests==2.27.25', + 'types-pyyaml==6.0.2', + 'types-tabulate==0.8.8', + 'types-protobuf==5.29.1.20250403', + 'types-setuptools==79.0.0.20250422', + 'types-jinja2==2.11.9', + 'types-colorama==0.4.6', + 'filelock==3.18.0', + 'junitparser==2.1.1', + 'rich==14.1.0', + 'optree==0.17.0', + 'types-openpyxl==3.1.5.20250919', + 'types-python-dateutil==2.9.0.20251008' +] + [[linter]] code = 'CLANGTIDY' include_patterns = [ @@ -964,7 +1008,6 @@ exclude_patterns = [ 'test/jit/**', # should be run through test/test_jit.py 'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py 'test/fx/**', # should be run through test/test_fx.py - 'test/bottleneck_test/**', # excluded by test/run_test.py 'test/package/**', # excluded by test/run_test.py 'test/distributed/argparse_util_test.py', 'test/distributed/bin/test_script.py', @@ -1259,6 +1302,7 @@ exclude_patterns = [ 'test/test_masked.py', 'test/test_maskedtensor.py', 'test/test_matmul_cuda.py', + 'test/test_scaled_matmul_cuda.py', 'test/test_meta.py', 'test/test_metal.py', 'test/test_mkl_verbose.py', @@ -1410,8 +1454,6 @@ exclude_patterns = [ 'torch/utils/benchmark/utils/timer.py', 'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py', 'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py', - 'torch/utils/bottleneck/__init__.py', - 'torch/utils/bottleneck/__main__.py', 'torch/utils/bundled_inputs.py', 'torch/utils/checkpoint.py', 'torch/utils/collect_env.py', @@ -1454,7 +1496,7 @@ init_command = [ '--dry-run={{DRYRUN}}', 'usort==1.0.8.post1', 'isort==6.0.1', - 'ruff==0.12.9', # sync with RUFF + 'ruff==0.13.1', # sync with RUFF ] is_formatter = true @@ -1571,6 +1613,7 @@ exclude_patterns = [ 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'test/dynamo/cpython/**', + 'test/test_torchfuzz_repros.py', 'scripts/**', 'third_party/**', 'fb/**', @@ -1588,7 +1631,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.12.9', # sync with PYFMT + 'ruff==0.13.1', # sync with PYFMT ] is_formatter = true diff --git a/BUILD.bazel b/BUILD.bazel index 635f39eed2ce..4737a2a0c486 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -13,6 +13,9 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources") load("//:tools/bazel.bzl", "rules") +# Export files for use by torch/headeronly (where version.h generation now lives) 
+exports_files(["version.txt"]) + define_targets(rules = rules) COMMON_COPTS = [ @@ -22,6 +25,7 @@ COMMON_COPTS = [ "-DHAVE_SHM_UNLINK=1", "-D_FILE_OFFSET_BITS=64", "-DUSE_FBGEMM", + "-DUSE_DISTRIBUTED", "-DAT_PER_OPERATOR_HEADERS", "-DATEN_THREADING=NATIVE", "-DNO_CUDNN_DESTROY_HANDLE", @@ -90,6 +94,8 @@ generated_cpu_cpp = [ "aten/src/ATen/NativeMetaFunctions.h", "aten/src/ATen/RegistrationDeclarations.h", "aten/src/ATen/VmapGeneratedPlumbing.h", + "aten/src/ATen/ViewMetaClasses.h", + "aten/src/ATen/ViewMetaClasses.cpp", "aten/src/ATen/core/aten_interned_strings.h", "aten/src/ATen/core/enum_tag.h", "aten/src/ATen/core/TensorBody.h", @@ -687,7 +693,9 @@ cc_library( "torch/csrc/*/generated/*.h", "torch/csrc/jit/serialization/mobile_bytecode_generated.h", ] + torch_cuda_headers, - ) + GENERATED_AUTOGRAD_CPP + [":version_h"], + ) + GENERATED_AUTOGRAD_CPP + [ + "//torch/headeronly:version_h", + ], includes = [ "third_party/kineto/libkineto/include", "torch/csrc", @@ -810,7 +818,7 @@ cc_library( name = "torch_python", srcs = libtorch_python_core_sources + if_cuda(libtorch_python_cuda_sources) - + libtorch_python_distributed_sources + + if_cuda(libtorch_python_distributed_sources) + GENERATED_AUTOGRAD_PYTHON, hdrs = glob([ "torch/csrc/generic/*.cpp", @@ -832,36 +840,6 @@ pybind_extension( ], ) -cc_library( - name = "functorch", - hdrs = glob([ - "functorch/csrc/dim/*.h", - ]), - srcs = glob([ - "functorch/csrc/dim/*.cpp", - ]), - deps = [ - ":aten_nvrtc", - ":torch_python", - "@pybind11", - ], -) - -pybind_extension( - name = "functorch/_C", - copts=[ - "-DTORCH_EXTENSION_NAME=_C" - ], - srcs = [ - "functorch/csrc/init_dim_only.cpp", - ], - deps = [ - ":functorch", - ":torch_python", - ":aten_nvrtc", - ], -) - cc_binary( name = "torch/bin/torch_shm_manager", srcs = [ @@ -902,7 +880,6 @@ py_library( ], data = [ ":torch/_C.so", - ":functorch/_C.so", ":torch/bin/torch_shm_manager", ], ) @@ -1105,6 +1082,7 @@ test_suite( "aten/src/ATen/templates/LazyNonNativeIr.h", "aten/src/ATen/templates/RegisterDispatchKey.cpp", "aten/src/ATen/templates/RegisterDispatchDefinitions.ini", + "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp", "aten/src/ATen/native/native_functions.yaml", "aten/src/ATen/native/tags.yaml", "aten/src/ATen/native/ts_native_functions.yaml", diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fba0eea881b..0b88247df27a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,4 @@ cmake_minimum_required(VERSION 3.27 FATAL_ERROR) -# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this # sometimes makes XCode C compiler gets detected as "Clang", even when the C++ @@ -181,9 +180,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)") set(CPU_POWER ON) endif() -# For non-supported platforms, turn USE_DISTRIBUTED off by default. -# NB: USE_DISTRIBUTED simply disables the backend; distributed code -# still gets built +# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not +# tested and likely won't work without additional changes. 
if(NOT LINUX AND NOT WIN32) set(USE_DISTRIBUTED OFF @@ -263,11 +261,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) -option(USE_DISTRIBUTED "Enable default distributed backends" ON) +option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_XCCL "Use XCCL" ON - "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF) + "USE_XPU;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) @@ -380,12 +378,19 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler" OFF "USE_CUDA" OFF) cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON "CPU_AARCH64" OFF) +# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le. +set(USE_PRIORITIZED_TEXT_DEFAULT OFF) +if(LINUX AND CPU_AARCH64) + set(USE_PRIORITIZED_TEXT_DEFAULT ON) +endif() +cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld." + "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF) option(USE_MIMALLOC "Use mimalloc" OFF) # Enable third party mimalloc library to improve memory allocation performance -# on Windows. +# on Windows and AArch64. option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF) -if(WIN32) +if(WIN32 OR (CPU_AARCH64 AND NOT APPLE)) set(USE_MIMALLOC ON) # Not enable USE_MIMALLOC_ON_MKL due to it caused issue: @@ -432,11 +437,12 @@ if(WIN32) PATH_SUFFIXES lib NO_DEFAULT_PATH) if(NOT libuv_tmp_LIBRARY) + set(USE_DISTRIBUTED OFF) set(USE_GLOO OFF) message( WARNING - "Libuv is not installed in current conda env. Set USE_GLOO to OFF. " - "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv." + "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " + "Please run command 'conda install -c conda-forge libuv=1.51' to install libuv." ) else() set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../) @@ -657,6 +663,11 @@ endif(MSVC) string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") +# Set linker max-page-size to 64KiB on AArch64 Linux +if(LINUX AND CPU_AARCH64) + add_link_options_if_supported("-z,max-page-size=0x10000") +endif() + # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not # applicable to mobile are disabled by this variable. 
Setting # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it @@ -877,23 +888,28 @@ cmake_dependent_option( "(USE_CUDA AND NOT MSVC) OR USE_ROCM" OFF) + +IF(USE_ROCM AND "gfx942" IN_LIST PYTORCH_ROCM_ARCH) + message(WARNING "Setting USE_FBGEMM_GENAI for gfx942 to ON by default, doing ROCM build") + set(USE_FBGEMM_GENAI_DEFAULT ON) +elseif(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) + message(STATUS "Setting USE_FBGEMM_GENAI to ON by default , doing CUDA build for SM100a") + set(USE_FBGEMM_GENAI_DEFAULT ON) +else() + set(USE_FBGEMM_GENAI_DEFAULT OFF) +endif() + cmake_dependent_option( USE_FBGEMM_GENAI "Whether to build FBGEMM GenAI quantized GEMM kernels.\ Will be disabled if not supported by the platform" - ON - "USE_ROCM" + ${USE_FBGEMM_GENAI_DEFAULT} + "(USE_CUDA AND NOT MSVC) OR USE_ROCM" OFF) -IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) - message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") - set(USE_FBGEMM_GENAI off) -endif() # Set USE_FBGEMM_GENAI to ON for CUDA build on SM100. if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) - message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a") - set(USE_FBGEMM_GENAI ON) endif() # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem @@ -1379,10 +1395,6 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() -if(BUILD_FUNCTORCH) - add_subdirectory(functorch) -endif() - # Parse custom debug info if(DEFINED USE_CUSTOM_DEBINFO) string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") @@ -1421,3 +1433,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() + +if(USE_PRIORITIZED_TEXT_FOR_LD) + add_compile_options( + $<$:-ffunction-sections> + $<$:-fdata-sections> + ) + set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") + set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") + + add_custom_command( + OUTPUT "${LINKER_SCRIPT_FILE_OUT}" + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" + DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" + COMMENT "Generating prioritized text linker files" + VERBATIM + ) + + add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + + if(BUILD_PYTHON) + set(LINKER_OPT_TARGETS torch_python) + endif() + + if(NOT BUILD_LIBTORCHLESS) + list(APPEND LINKER_OPT_TARGETS torch_cpu c10) + if(USE_CUDA) + list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) + endif() + if(USE_XPU) + list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) + endif() + if(USE_ROCM) + list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) + endif() + endif() + + foreach(tgt IN LISTS LINKER_OPT_TARGETS) + if(TARGET ${tgt}) + add_dependencies("${tgt}" generate_linker_script) + target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") + set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + else() + message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") + endif() + endforeach() + +else() + if(LINUX AND CPU_AARCH64) + message(WARNING [[ + It is 
strongly recommend to enable linker script optimization for all AArch64 Linux builds. + To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 + ]]) + endif() +endif() diff --git a/CODEOWNERS b/CODEOWNERS index 1d91adacb062..cc249dc4f43a 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -181,15 +181,15 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd /torch/csrc/jit/python/init.cpp @mikaylagawarecki # CUDA and CUDA math libraries -aten/src/ATen/cuda/ @eqy @syed-ahmed -aten/src/ATen/cudnn/ @eqy @syed-ahmed -aten/src/ATen/native/cuda/ @eqy @syed-ahmed -aten/src/ATen/native/cudnn/ @eqy @syed-ahmed -c10/cuda @eqy @syed-ahmed -torch/cuda/ @eqy @syed-ahmed -torch/csrc/cuda/ @eqy @syed-ahmed -torch/backends/cuda/ @eqy @syed-ahmed -torch/backends/cudnn/ @eqy @syed-ahmed +aten/src/ATen/cuda/ @eqy @syed-ahmed @Aidyn-A +aten/src/ATen/cudnn/ @eqy @syed-ahmed @Aidyn-A +aten/src/ATen/native/cuda/ @eqy @syed-ahmed @Aidyn-A +aten/src/ATen/native/cudnn/ @eqy @syed-ahmed @Aidyn-A +c10/cuda @eqy @syed-ahmed @Aidyn-A +torch/cuda/ @eqy @syed-ahmed @Aidyn-A +torch/csrc/cuda/ @eqy @syed-ahmed @Aidyn-A +torch/backends/cuda/ @eqy @syed-ahmed @Aidyn-A +torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A # PyTree utilities /torch/utils/_pytree.py @XuehaiPan @@ -201,3 +201,17 @@ torch/backends/cudnn/ @eqy @syed-ahmed /torch/csrc/stable/ @janeyx99 @mikaylagawarecki /torch/headeronly/ @janeyx99 /torch/header_only_apis.txt @janeyx99 + +# FlexAttention +/torch/nn/attention/flex_attention.py @drisspg +/torch/_higher_order_ops/flex_attention.py @drisspg +/torch/_inductor/kernel/flex/ @drisspg +/torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg +/test/inductor/test_flex_attention.py @drisspg +/test/inductor/test_flex_decoding.py @drisspg + +# Low Precision GEMMs +/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58 +/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58 +/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58 +/test/test_scaled_matmul_cuda.py @drisspg @slayton58 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9d2b5d355391..4c46077f9db7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,7 +81,7 @@ git remote add upstream git@github.com:pytorch/pytorch.git make setup-env # Or run `make setup-env-cuda` for pre-built CUDA binaries # Or run `make setup-env-rocm` for pre-built ROCm binaries -source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` ### Tips and Debugging @@ -182,28 +182,36 @@ You can use this script to check out a new nightly branch with the following: ```bash ./tools/nightly.py checkout -b my-nightly-branch -source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` To install the nightly binaries built with CUDA, you can pass in the flag `--cuda`: ```bash ./tools/nightly.py checkout -b my-nightly-branch --cuda -source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` To install the nightly binaries built with ROCm, you can pass in the flag `--rocm`: ```bash ./tools/nightly.py checkout -b my-nightly-branch --rocm -source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +source venv/bin/activate # or `. 
.\venv\Scripts\activate` on Windows ``` You can also use this tool to pull the nightly commits into the current branch: ```bash -./tools/nightly.py pull -p my-env -source my-env/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +./tools/nightly.py pull +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +``` + +To create the virtual environment with a specific Python interpreter, you can +pass in the `--python` argument: + +```bash +./tools/nightly.py --python /path/to/python3.12 +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` Pulling will recreate a fresh virtual environment and reinstall the development diff --git a/Dockerfile b/Dockerfile index 7b8964bd860e..331cf00593cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,11 +50,10 @@ RUN git submodule update --init --recursive FROM conda as conda-installs ARG PYTHON_VERSION=3.11 ARG CUDA_PATH=cu121 -ARG CUDA_CHANNEL=nvidia ARG INSTALL_CHANNEL=whl/nightly # Automatically set by buildx -RUN /opt/conda/bin/conda update -y -n base -c defaults conda -RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} +# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574 +RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0 ARG TARGETPLATFORM diff --git a/MANIFEST.in b/MANIFEST.in index ec00f251160b..bb8e488283a9 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,20 +1,61 @@ # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html -# Include source files in SDist -include CMakeLists.txt -include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE -include BUCK BUCK.* -include requirements*.txt -include version.txt -include [Mm]akefile *.[Mm]akefile [Mm]akefile.* -include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore +# Include individual top-level files +include CITATION.cff +include CODEOWNERS +include Dockerfile +include LICENSE +include MANIFEST.in +include Makefile +include NOTICE +include .bc-linter.yml +include .clang-format .clang-tidy +include .cmakelintrc +include .coveragerc +include .dockerignore +include .editorconfig +include .flake8 +include .gdbinit +include .lintrunner.toml +include .lldbinit +include codex_setup.sh +include docker.Makefile +include pyrefly.toml +include ubsan.supp + +# Include bazel and BUCK related files +include BUILD.bazel BUCK.oss +include WORKSPACE +include *.bzl +include .bazelignore .bazelrc .bazelversion + +# Include general configuration files +include *.ini +# Include important top-level information +include *.md +# Include technical text files at the moment, comprises +# version.txt, CMakeLists.txt, requirements.txt +include *.txt + +# Include ctags configuration +include .ctags.d/*.ctags + +# Include subfolders completely +graft .devcontainer +graft .vscode graft android graft aten +graft benchmarks graft binaries graft c10 graft caffe2 graft cmake +graft docs graft functorch +graft ios +graft mypy_plugins +graft scripts +graft test graft third_party graft tools graft torch @@ -22,29 +63,37 @@ graft torchgen # FIXME: torch-xla build during codegen will fail if include this file in wheel exclude torchgen/BUILD.bazel -# Misc files and directories in SDist -include *.md -include CITATION.cff -include LICENSE NOTICE -include mypy*.ini -graft benchmarks -graft docs -graft mypy_plugins -graft scripts +# The following exclusions omit parts from third-party dependencies that +# contain invalid symlinks[1] and that are not needed for pytorch, such as +# bindings for unused 
languages +prune third_party/flatbuffers/java +prune third_party/flatbuffers/kotlin +prune third_party/ittapi/rust +prune third_party/nccl/pkg/debian +prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-* + +# The following document is also an invalid symlink[1] and superfluous +exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md + +# Omit autogenerated code +prune torchgen/packaged + +# Omit caches, compiled, and scm related content +prune */__pycache__ +prune **/.github +prune **/.gitlab +global-exclude *.o *.obj *.so *.dylib *.a *.pxd *.dll *.lib +global-exclude *.py[cod] *.swp *~ +global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules +global-exclude .gitlab-ci.yml # Misc files needed for custom setuptools command include .gitignore include .gitmodules -# Include test suites in SDist -graft test -include pytest.ini -include .coveragerc - -# Prune generated/compiled files -prune torchgen/packaged -prune */__pycache__ -global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod] +# [1] Invalid symlinks for the purposes of Python source distributions are, +# according to the source distribution format[2] links pointing outside the +# destination directory or links with a `..` component, which is those of +# concern here. -prune */.git -global-exclude .git *~ *.swp +# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features diff --git a/README.md b/README.md index 99e6dabd1618..61b4447ddf4d 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) #### Prerequisites If you are installing from source, you will need: -- Python 3.9 or later +- Python 3.10 or later - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux) - Visual Studio or Visual Studio Build Tool (Windows only) @@ -275,7 +275,7 @@ conda install pkg-config libuv pip install mkl-static mkl-include # Add these packages if torch.distributed is needed. # Distributed package support on Windows is a prototype feature and is subject to changes. 
-conda install -c conda-forge libuv +conda install -c conda-forge libuv=1.51 ``` #### Install PyTorch diff --git a/RELEASE.md b/RELEASE.md index 52371e73f0a6..87f042d659fd 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,6 +3,7 @@ - [Release Compatibility Matrix](#release-compatibility-matrix) + - [PyTorch CUDA Support Matrix](#pytorch-cuda-support-matrix) - [Release Cadence](#release-cadence) - [General Overview](#general-overview) - [Frequently Asked Questions](#frequently-asked-questions) @@ -63,6 +64,22 @@ Following is the Release Compatibility Matrix for PyTorch releases: | 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 | | 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 | +### PyTorch CUDA Support Matrix + +For Release 2.9 PyTorch Supports following CUDA Architectures: + +| CUDA | architectures supported for Linux x86 and Windows builds | notes | +| --- | --- | --- | +| 12.6.3 | Maxwell(5.0), Pascal(6.0), Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0) | | +| 12.8.1 | Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0) | | +| 13.0.0 | Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0+PTX) | +PTX available on linux builds only | + +| CUDA | architectures supported for Linux aarch64 builds | +| --- | --- | +| 12.6.3 | Ampere(8.0), Hopper(9.0) | +| 12.8.1 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 12.0) | +| 13.0.0 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 11.0, 12.0+PTX) | + ## Release Cadence Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional. diff --git a/aten/src/ATen/BlasBackend.h b/aten/src/ATen/BlasBackend.h index 307793301441..03b00cc21564 100644 --- a/aten/src/ATen/BlasBackend.h +++ b/aten/src/ATen/BlasBackend.h @@ -28,4 +28,19 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) { return stream << BlasBackendToString(backend); } +namespace blas { + +enum class ScalingType : std::uint8_t { + TensorWise, // fp32 scales + RowWise, // fp32 scales + BlockWise1x16, // fp8_e4m3fn scales + BlockWise1x32, // fp8_e8m0fnu scales + BlockWise1x128, // fp32 scales + BlockWise128x128, // fp32 scales +}; + +enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 }; + +} // namespace blas + } // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index aa250c8b7fae..6bf0797b9e46 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -256,6 +256,7 @@ endif() IF(USE_FBGEMM_GENAI) set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/) set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) + if(USE_CUDA) # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build. # If you want to integrate a kernel from FBGEMM into torch, you have to add it here. 
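Note on the new scaled-GEMM enums introduced in aten/src/ATen/BlasBackend.h above: the hunk only declares `at::blas::ScalingType` and `at::blas::SwizzleType`, so the snippet below is a minimal, hypothetical C++ sketch (not part of this patch) of how a caller might dispatch on `ScalingType`; the scale-dtype annotations simply restate the comments from the enum declaration, and `SwizzleType` would be handled analogously.

// Hypothetical illustration only; assumes just the declarations added in the hunk above.
#include <ATen/BlasBackend.h> // declares at::blas::ScalingType / at::blas::SwizzleType
#include <cstdio>

// Map each scaling mode to a printable description.
// The scale-dtype notes mirror the comments in the enum declaration.
static const char* scaling_type_name(at::blas::ScalingType t) {
  switch (t) {
    case at::blas::ScalingType::TensorWise:       return "tensorwise (fp32 scales)";
    case at::blas::ScalingType::RowWise:          return "rowwise (fp32 scales)";
    case at::blas::ScalingType::BlockWise1x16:    return "blockwise 1x16 (fp8_e4m3fn scales)";
    case at::blas::ScalingType::BlockWise1x32:    return "blockwise 1x32 (fp8_e8m0fnu scales)";
    case at::blas::ScalingType::BlockWise1x128:   return "blockwise 1x128 (fp32 scales)";
    case at::blas::ScalingType::BlockWise128x128: return "blockwise 128x128 (fp32 scales)";
  }
  return "unknown";
}

int main() {
  std::printf("%s\n", scaling_type_name(at::blas::ScalingType::RowWise));
  return 0;
}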
@@ -292,48 +293,65 @@ IF(USE_FBGEMM_GENAI) "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/" ) - target_include_directories(fbgemm_genai PUBLIC + target_include_directories(fbgemm_genai PRIVATE ${FBGEMM_THIRD_PARTY}/cutlass/include ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include ${fbgemm_genai_mx8mx8bf16_grouped} ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h ) - else() - if(USE_ROCM) - # Only include the kernels we want to build to avoid increasing binary size. - file(GLOB_RECURSE fbgemm_genai_native_rocm_hip - "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" - "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") - set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - - # Add additional HIPCC compiler flags for performance - set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS - -mllvm - -amdgpu-coerce-illegal-types=1 - -mllvm - -enable-post-misched=0 - -mllvm - -greedy-reverse-local-assignment=1 - -fhip-new-launch-api) - - hip_add_library( - fbgemm_genai STATIC - ${fbgemm_genai_native_rocm_hip} - HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) - set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) - - target_include_directories(fbgemm_genai PUBLIC - # FBGEMM version of Composable Kernel is used due to some customizations - ${FBGEMM_THIRD_PARTY}/composable_kernel/include - ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include - ${FBGEMM_THIRD_PARTY}/cutlass/include - ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include - ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp - ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h - ) + + # Add FBGEMM_GENAI include directories for torch_ops.h + list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) + list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) + elseif(USE_ROCM) + # Only include the kernels we want to build to avoid increasing binary size. + file(GLOB_RECURSE fbgemm_genai_native_rocm_hip + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") + set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + + # Add additional HIPCC compiler flags for performance + set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS + -mllvm + -enable-post-misched=0 + -mllvm + -greedy-reverse-local-assignment=1 + -fhip-new-launch-api) + if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0") + list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1) + endif() + + # Only compile for gfx942 for now. 
+ # This is rather hacky, I could not figure out a clean solution :( + set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS}) + string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}") + if("gfx942" IN_LIST PYTORCH_ROCM_ARCH) + list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;) endif() + set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS}) + + hip_add_library( + fbgemm_genai STATIC + ${fbgemm_genai_native_rocm_hip} + HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL}) + set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + target_include_directories(fbgemm_genai PRIVATE + # FBGEMM version of Composable Kernel is used due to some customizations + ${FBGEMM_THIRD_PARTY}/composable_kernel/include + ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h + ) + + # Add FBGEMM_GENAI include directories for torch_ops.h + list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) + list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) endif() endif() @@ -595,6 +613,11 @@ if(UNIX) if(HAVE_MALLOC_USABLE_SIZE) add_definitions(-DHAVE_MALLOC_USABLE_SIZE=1) endif(HAVE_MALLOC_USABLE_SIZE) + set(CMAKE_EXTRA_INCLUDE_FILES "fcntl.h") + CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE) + if(HAVE_POSIX_FALLOCATE) + add_definitions(-DHAVE_POSIX_FALLOCATE=1) + endif(HAVE_POSIX_FALLOCATE) endif(UNIX) ADD_DEFINITIONS(-DUSE_EXTERNAL_MZCRC) @@ -677,12 +700,6 @@ if(USE_CUDA AND NOT USE_ROCM) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include) - # Add FBGEMM_GENAI include directories for torch_ops.h - if(USE_FBGEMM_GENAI) - list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) - list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) - endif() - if($ENV{ATEN_STATIC_CUDA}) if(CUDA_VERSION VERSION_LESS_EQUAL 12.9) list(APPEND ATen_CUDA_DEPENDENCY_LIBS diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 39932b1c4398..8b283556c7a4 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -144,8 +144,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef tensors) { inline bool _apply_preamble(ArrayRef tensors) { checkDeviceType("CPU_tensor_apply", tensors, kCPU); checkLayout("CPU_tensor_apply", tensors, kStrided); - if (!_all_equal_numel(tensors)) - TORCH_CHECK(false, _all_equal_numel_error(tensors)); + TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors)); // An empty tensor has no elements for (auto& t : tensors) if (t.numel() == 0) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 4d48084b0ab8..3310abfb41d5 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ 
-40,41 +40,6 @@ namespace { ->conv ->rnn */ -const std::map> _fp32_precisions = { - {"generic", {{"ieee", "tf32", "bf16", "none"}}}, - {"mkldnn", {{"ieee", "tf32", "bf16", "none"}}}, - {"cuda", {{"ieee", "tf32", "none"}}}}; - -// Check whether the backend and op are legal -void check_fp32_prec_backend_and_op( - const std::string& backend, - const std::string& op) { - static std::vector backends = {"generic", "mkldnn", "cuda"}; - static std::vector operators = {"conv", "matmul", "rnn", "all"}; - TORCH_CHECK( - std::find(backends.begin(), backends.end(), backend) != backends.end(), - "Invalid backend: ", - backend); - TORCH_CHECK( - std::find(operators.begin(), operators.end(), op) != operators.end(), - "Invalid operator: ", - op); - if (backend == "generic") { - TORCH_CHECK(op == "all", "Invalid operation for generic backend: ", op); - } - } - - // Return whether the precision is supported by backends - bool validate_fp32_prec( - const std::string& backend, - const std::string& precision) { - auto iterp = _fp32_precisions.find(backend); - TORCH_CHECK(iterp != _fp32_precisions.end()); - auto precisions = iterp->second; - bool valid = std::find(precisions.begin(), precisions.end(), precision) != - precisions.end(); - return valid; - } C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){ TORCH_WARN_ONCE( @@ -86,6 +51,54 @@ void check_fp32_prec_backend_and_op( } } // namespace +Float32Backend str2backend(const std::string& name) { + if (name == "generic") + return Float32Backend::GENERIC; + else if (name == "cuda") + return Float32Backend::CUDA; + else if (name == "mkldnn") + return Float32Backend::MKLDNN; + TORCH_CHECK(false, "Unknown backend: ", name); +} + +Float32Op str2op(const std::string& name) { + if (name == "all") + return Float32Op::ALL; + else if (name == "conv") + return Float32Op::CONV; + else if (name == "rnn") + return Float32Op::RNN; + else if (name == "matmul") + return Float32Op::MATMUL; + TORCH_CHECK(false, "Unknown op: ", name); +} + +Float32Precision str2precision(const std::string& name) { + if (name == "none") + return Float32Precision::NONE; + else if (name == "ieee") + return Float32Precision::IEEE; + else if (name == "tf32") + return Float32Precision::TF32; + else if (name == "bf16") + return Float32Precision::BF16; + TORCH_CHECK(false, "Unknown precision: ", name); +} + +std::string precision2str(Float32Precision prec) { + switch (prec) { + case Float32Precision::NONE: + return "none"; + case Float32Precision::IEEE: + return "ieee"; + case Float32Precision::TF32: + return "tf32"; + case Float32Precision::BF16: + return "bf16"; + } + TORCH_CHECK(false, "Invalid enum Float32Precision(", static_cast(prec), ")"); +} + Context::Context() = default; // TODO: This could be bad juju if someone calls globalContext() in the @@ -179,10 +192,10 @@ void Context::setUserEnabledNNPACK(bool e) { enabled_nnpack = e; } -bool Context::allowTF32CuDNN(const std::string& op) const { - if (op.size() == 0){ - bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32"; - bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32"; +bool Context::allowTF32CuDNN(std::optional op) const { + if (!op.has_value()) { + bool allow_tf32_rnn = float32Precision(Float32Backend::CUDA, Float32Op::RNN) == Float32Precision::TF32; + bool allow_tf32_conv = float32Precision(Float32Backend::CUDA, Float32Op::CONV) == Float32Precision::TF32; TORCH_CHECK( allow_tf32_rnn == allow_tf32_conv && allow_tf32_rnn == allow_tf32_cudnn, "PyTorch is checking whether allow_tf32 is enabled for cuDNN without 
a specific operator name,", @@ -191,15 +204,15 @@ bool Context::allowTF32CuDNN(const std::string& op) const { "We suggest only using the new API to set the TF32 flag(s). See also: ", "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"); } else { - return float32Precision("cuda", op) == "tf32"; + return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32; } warn_deprecated_fp32_precision_api(); return allow_tf32_cudnn; } void Context::setAllowTF32CuDNN(bool b) { - setFloat32Precision("cuda", "rnn", b ? "tf32" : "none"); - setFloat32Precision("cuda", "conv", b ? "tf32" : "none"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE); + setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE); allow_tf32_cudnn = b; warn_deprecated_fp32_precision_api(); } @@ -279,45 +292,6 @@ bool Context::userEnabledOverrideableSDP() const { return enabled_overrideable; } -static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG"; -static constexpr const std::array cublas_deterministic_configs = {":4096:8", ":16:8"}; -#ifdef USE_ROCM -static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32"; -#endif - -bool Context::checkCuBLASConfigDeterministic() { - // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config - // is set to deterministic setting - if (hasCUDART()) { - const auto workspace_config = c10::utils::get_env(cublas_config_var_name); - return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]); - } - return true; -} - -void Context::alertCuBLASConfigNotDeterministic() const { - static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic(); - if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) { - return; - } - - auto msg = c10::str( - "Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ", - "`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ", - "it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ", - "case, you must set an environment variable before running your PyTorch application: ", - cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ", - cublas_config_var_name, "=", cublas_deterministic_configs[1], ". 
For more information, go to ", - "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility" - ); - - if (deterministicAlgorithmsWarnOnly()) { - TORCH_WARN(msg); - } else { - TORCH_CHECK(false, msg); - } -} - bool Context::benchmarkCuDNN() const { return benchmark_cudnn; } @@ -343,14 +317,8 @@ void Context::setImmediateMiopen(bool b) { } bool Context::allowTF32CuBLAS() const { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - return false; - } -#endif bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; - bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32"; + bool allow_tf32_new = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32; TORCH_CHECK( legacy_allow_tf32 == allow_tf32_new, "PyTorch is checking whether allow_tf32_new is enabled for cuBlas matmul,", @@ -362,26 +330,18 @@ bool Context::allowTF32CuBLAS() const { } void Context::setAllowTF32CuBLAS(bool b) { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. " - << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it."; - return; - } -#endif float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; - setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, b ? Float32Precision::TF32 : Float32Precision::IEEE); } Float32MatmulPrecision Context::float32MatmulPrecision() const { - bool invalid = float32Precision("cuda", "matmul") == "tf32" && + bool invalid = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32 && float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST; invalid = invalid || - (float32Precision("mkldnn", "matmul") == "bf16" && + (float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::BF16 && float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM); invalid = invalid || - (float32Precision("mkldnn", "matmul") == "tf32" && + (float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::TF32 && float32_matmul_precision != at::Float32MatmulPrecision::HIGH); TORCH_CHECK( !invalid, @@ -393,15 +353,26 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const { return float32_matmul_precision; } -std::string Context::float32Precision(const std::string& backend, const std::string& op) const { - check_fp32_prec_backend_and_op(backend, op); - auto precision = fp32_precision.find(backend)->second.find(op)->second; - if (precision == "none") - precision = fp32_precision.find(backend)->second.find("all")->second; - if (precision == "none") - precision = fp32_precision.find("generic")->second.find("all")->second; - bool valid_prec = validate_fp32_prec(backend, precision); - return valid_prec ? 
precision : "none"; +Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) const { + std::pair key{backend, op}; + auto it = fp32_precision.find(key); + TORCH_CHECK(it != fp32_precision.end(), "Invalid (backend, op) pair: (", backend, ", ", op, ")"); + + Float32Precision precision = it->second; + if (precision == Float32Precision::NONE) { + key.second = Float32Op::ALL; + precision = fp32_precision.find(key)->second; + } + if (precision == Float32Precision::NONE) { + key.first = Float32Backend::GENERIC; + precision = fp32_precision.find(key)->second; + } + + // "cuda" does not support "bf16" + if (backend == Float32Backend::CUDA && precision == Float32Precision::BF16) { + return Float32Precision::NONE; + } + return precision; } void Context::setFloat32MatmulPrecision(const std::string &s) { @@ -410,18 +381,18 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention if (s_ == "highest") { float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; - setFloat32Precision("cuda", "matmul", "ieee"); - setFloat32Precision("mkldnn", "matmul", "ieee"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::IEEE); + setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::IEEE); return true; } else if (s_ == "high") { float32_matmul_precision = at::Float32MatmulPrecision::HIGH; - setFloat32Precision("cuda", "matmul", "tf32"); - setFloat32Precision("mkldnn", "matmul", "tf32"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32); + setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::TF32); return true; } else if (s_ == "medium") { float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM; - setFloat32Precision("cuda", "matmul", "tf32"); - setFloat32Precision("mkldnn", "matmul", "bf16"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32); + setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::BF16); return true; } return false; @@ -435,25 +406,16 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { "setFloat32MatmulPrecision call has no effect."); } -void Context::setFloat32Precision(const std::string& backend, const std::string& op, const std::string& p) { - check_fp32_prec_backend_and_op(backend, op); - if (validate_fp32_prec(backend, p)) { - fp32_precision[backend][op] = p; - } else { - std::string msg; - auto iterp = _fp32_precisions.find(backend); - TORCH_CHECK(iterp != _fp32_precisions.end()); - for (auto p : iterp->second) { - msg += p; - msg += " "; - } - TORCH_WARN( - "you have set wrong precision for backend:", - backend, - " setFloat32Precision call has no effect.", - "Please choose precision from: ", - msg); - } +void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32Precision p) { + auto it = fp32_precision.find(std::make_pair(backend, op)); + TORCH_CHECK( + it != fp32_precision.end(), + "Invalid (backend, op) pair: (", backend, ", ", op, ")"); + TORCH_CHECK( + !(backend == Float32Backend::CUDA && p == Float32Precision::BF16), + "backend 'cuda' does not support precision 'bf16'"); + + it->second = p; } at::LinalgBackend Context::linalgPreferredBackend() const { @@ -521,8 +483,8 @@ at::BlasBackend Context::blasPreferredBackend() { #if ROCM_VERSION >= 60300 "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908", #endif -#if 
ROCM_VERSION >= 60500 - "gfx950" +#if ROCM_VERSION >= 70000 + "gfx950", "gfx1150", "gfx1151" #endif }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { @@ -625,20 +587,33 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { rocm_fa_preferred_backend = b; } -bool Context::allowFP16ReductionCuBLAS() const { +CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const { return allow_fp16_reduction_cublas; } -void Context::setAllowFP16ReductionCuBLAS(bool b) { - allow_fp16_reduction_cublas = b; +CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) { + TORCH_CHECK( + !(allow_reduced_precision && !allow_splitk), + "allow_splitk=False is not supported when reduced precision reductions are enabled"); + if (allow_reduced_precision) { + return CuBLASReductionOption::AllowReducedPrecisionWithSplitK; + } else if (allow_splitk) { + return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK; + } else { + return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK; + } +} + +void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) { + allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk); } -bool Context::allowBF16ReductionCuBLAS() const { +CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const { return allow_bf16_reduction_cublas; } -void Context::setAllowBF16ReductionCuBLAS(bool b) { - allow_bf16_reduction_cublas = b; +void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) { + allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk); } bool Context::allowFP16AccumulationCuBLAS() const { diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 5cfa9b23e20a..d0f6ce18862a 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -25,11 +25,13 @@ #include #include #include +#include #include #include #include #include +#include namespace at { @@ -37,6 +39,20 @@ class Tensor; enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM }; +enum class CuBLASReductionOption : uint8_t { + AllowReducedPrecisionWithSplitK = 0, + DisallowReducedPrecisionAllowSplitK = 1, + DisallowReducedPrecisionDisallowSplitK = 2, +}; +enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN }; +enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL }; +enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 }; + +TORCH_API Float32Backend str2backend(const std::string& name); +TORCH_API Float32Op str2op(const std::string& name); +TORCH_API Float32Precision str2precision(const std::string& name); +TORCH_API std::string precision2str(Float32Precision prec); + class TORCH_API Context { public: Context(); @@ -210,15 +226,15 @@ class TORCH_API Context { bool userEnabledMkldnn() const; void setUserEnabledMkldnn(bool e); bool benchmarkCuDNN() const; - void setBenchmarkCuDNN(bool); + void setBenchmarkCuDNN(bool /*b*/); int benchmarkLimitCuDNN() const; - void setBenchmarkLimitCuDNN(int); + void setBenchmarkLimitCuDNN(int /*b*/); bool immediateMiopen() const; - void setImmediateMiopen(bool); + void setImmediateMiopen(bool /*b*/); bool deterministicCuDNN() const; - void setDeterministicCuDNN(bool); + void setDeterministicCuDNN(bool /*b*/); bool deterministicMkldnn() const; - void setDeterministicMkldnn(bool); + void setDeterministicMkldnn(bool /*b*/); bool userEnabledNNPACK() const; void setUserEnabledNNPACK(bool e); @@ -236,32 +252,32 @@ class TORCH_API 
Context { void setSDPPriorityOrder(const std::vector& order); std::array sDPPriorityOrder(); - void setSDPUseFlash(bool); + void setSDPUseFlash(bool /*e*/); bool userEnabledFlashSDP() const; - void setSDPUseMemEfficient(bool); + void setSDPUseMemEfficient(bool /*e*/); bool userEnabledMemEfficientSDP() const; - void setSDPUseMath(bool); + void setSDPUseMath(bool /*e*/); bool userEnabledMathSDP() const; - void setSDPUseCuDNN(bool); + void setSDPUseCuDNN(bool /*e*/); bool userEnabledCuDNNSDP() const; - void setAllowFP16BF16ReductionMathSDP(bool); + void setAllowFP16BF16ReductionMathSDP(bool /*e*/); bool allowFP16BF16ReductionMathSDP() const; - void setSDPUseOverrideable(bool); + void setSDPUseOverrideable(bool /*e*/); bool userEnabledOverrideableSDP() const; at::LinalgBackend linalgPreferredBackend() const; - void setLinalgPreferredBackend(at::LinalgBackend); + void setLinalgPreferredBackend(at::LinalgBackend /*b*/); at::BlasBackend blasPreferredBackend(); - void setBlasPreferredBackend(at::BlasBackend); + void setBlasPreferredBackend(at::BlasBackend /*b*/); at::ROCmFABackend getROCmFAPreferredBackend(); - void setROCmFAPreferredBackend(at::ROCmFABackend); + void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/); // Note [Enabling Deterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -294,9 +310,9 @@ class TORCH_API Context { bool deterministicAlgorithms() const; bool deterministicAlgorithmsWarnOnly() const; - void setDeterministicAlgorithms(bool, bool); + void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/); bool deterministicFillUninitializedMemory() const; - void setDeterministicFillUninitializedMemory(bool); + void setDeterministicFillUninitializedMemory(bool /*b*/); // Note [Writing Nondeterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -310,13 +326,7 @@ class TORCH_API Context { // // * Throw an error when `Context::deterministicAlgorithms()` is true. Most // of the time, this should be accomplished by calling - // `at::globalContext().alertNotDeterminstic()`. However, if the - // nondeterministic behavior is caused by the CuBLAS workspace - // configuration in CUDA >= 10.2, - // `at::globalContext().alertCuBLASConfigNotDeterministic()` should be - // called instead (in this case, a comment explaining why the operation is - // nondeterministic is not necessary). See below for details on these - // methods. + // `at::globalContext().alertNotDeterminstic(). // // * Have an entry in the list of nondeterministic PyTorch operations in the // docstring of `use_deterministic_algorithms()` in torch/__init__.py @@ -340,33 +350,29 @@ class TORCH_API Context { // Throws an error if `Context::deterministicAlgorithms()` is true static void alertNotDeterministic(std::string_view const& caller); - // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA - // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or - // ":4096:8". 
For more details: - // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility - void alertCuBLASConfigNotDeterministic() const; - void setFloat32MatmulPrecision(const std::string& s); void setFloat32Precision( - const std::string& backend, - const std::string& op, - const std::string& s); - bool allowTF32CuDNN(const std::string& op = std::string()) const; - void setAllowTF32CuDNN(bool); + Float32Backend backend, + Float32Op op, + Float32Precision p); + bool allowTF32CuDNN(std::optional op = std::nullopt) const; + void setAllowTF32CuDNN(bool /*b*/); bool allowTF32OneDNN() const; - void setAllowTF32OneDNN(bool); + void setAllowTF32OneDNN(bool /*b*/); bool allowTF32CuBLAS() const; - void setAllowTF32CuBLAS(bool); + void setAllowTF32CuBLAS(bool /*b*/); Float32MatmulPrecision float32MatmulPrecision() const; - std::string float32Precision( - const std::string& backend, - const std::string& op) const; - bool allowFP16ReductionCuBLAS() const; - void setAllowFP16ReductionCuBLAS(bool); - bool allowBF16ReductionCuBLAS() const; - void setAllowBF16ReductionCuBLAS(bool); + Float32Precision float32Precision(Float32Backend backend, Float32Op op) const; + CuBLASReductionOption allowFP16ReductionCuBLAS() const; + void setAllowFP16ReductionCuBLAS( + bool allow_reduced_precision, + bool allow_splitk = true); + CuBLASReductionOption allowBF16ReductionCuBLAS() const; + void setAllowBF16ReductionCuBLAS( + bool allow_reduced_precision, + bool allow_splitk = true); bool allowFP16AccumulationCuBLAS() const; - void setAllowFP16AccumulationCuBLAS(bool); + void setAllowFP16AccumulationCuBLAS(bool /*b*/); // Matmuls can use a so-called "persistent" kernel which launches one CUDA // block for each SM on the GPU, and each block then iterates over multiple @@ -378,7 +384,7 @@ class TORCH_API Context { // to make matmuls target only a subset of the SMs, so they can fully schedule // even next to a comms kernel, and only be a few percent slower. 
std::optional _SMCarveout_EXPERIMENTAL() const; - void _setSMCarveout_EXPERIMENTAL(std::optional); + void _setSMCarveout_EXPERIMENTAL(std::optional /*c*/); at::QEngine qEngine() const; void setQEngine(at::QEngine e); @@ -399,7 +405,7 @@ class TORCH_API Context { void setDefaultMobileCPUAllocator(); void unsetDefaultMobileCPUAllocator(); bool allowFP16ReductionCPU() const; - void setAllowFP16ReductionCPU(bool); + void setAllowFP16ReductionCPU(bool /*b*/); // Preserved for BC void lazyInitCUDA() { @@ -429,7 +435,6 @@ class TORCH_API Context { } private: - static bool checkCuBLASConfigDeterministic(); std::array init_; bool enabled_cudnn = true; bool deterministic_cudnn = false; @@ -457,8 +462,10 @@ class TORCH_API Context { : at::Float32MatmulPrecision::HIGHEST; int benchmark_limit_cudnn = 10; bool allow_tf32_cudnn = true; - bool allow_fp16_reduction_cublas = true; - bool allow_bf16_reduction_cublas = true; + CuBLASReductionOption allow_fp16_reduction_cublas = + CuBLASReductionOption::AllowReducedPrecisionWithSplitK; + CuBLASReductionOption allow_bf16_reduction_cublas = + CuBLASReductionOption::AllowReducedPrecisionWithSplitK; bool allow_fp16_accumulation_cublas = false; std::optional sm_carveout = std::nullopt; bool enabled_mkldnn = true; @@ -488,21 +495,20 @@ class TORCH_API Context { bool enable_sparse_tensor_invariant_checks = false; bool allow_fp16_reduction_cpu = false; - std::map> fp32_precision = { - {"generic", {{"all", "none"}}}, - {"mkldnn", - {{"matmul", "none"}, - {"conv", "none"}, - {"rnn", "none"}, - {"all", "none"}}}, - {"cuda", - {{"matmul", - float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST - ? "none" - : "tf32"}, - {"conv", "tf32"}, - {"rnn", "tf32"}, - {"all", "none"}}}, + using Key = std::pair; + std::unordered_map> fp32_precision = { + {{Float32Backend::GENERIC, Float32Op::ALL}, Float32Precision::NONE}, + {{Float32Backend::MKLDNN, Float32Op::ALL}, Float32Precision::NONE}, + {{Float32Backend::MKLDNN, Float32Op::CONV}, Float32Precision::NONE}, + {{Float32Backend::MKLDNN, Float32Op::RNN}, Float32Precision::NONE}, + {{Float32Backend::MKLDNN, Float32Op::MATMUL}, Float32Precision::NONE}, + {{Float32Backend::CUDA, Float32Op::ALL}, Float32Precision::NONE}, + {{Float32Backend::CUDA, Float32Op::CONV}, Float32Precision::TF32}, + {{Float32Backend::CUDA, Float32Op::RNN}, Float32Precision::TF32}, + {{Float32Backend::CUDA, Float32Op::MATMUL}, + float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST + ? Float32Precision::NONE + : Float32Precision::TF32}, }; Allocator* prev_allocator_ptr_{nullptr}; @@ -684,5 +690,4 @@ struct TORCH_API ROCmBackwardPassGuard { ~ROCmBackwardPassGuard(); static bool is_backward_pass(); }; - } // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index b16d188b99a5..ccb0ae15a11e 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -389,54 +389,16 @@ void fillVersion( // constructed out of ATen tensor template T* toDLPackImpl(const Tensor& src) { - auto view = src; - - // Detect whether there is need to normalize the strides - // Background: gh-83069 - // - // However, normalizing strides can come at a high-cost - // to slow down toDLPack conversion 3x, so we - // only normalize if needed. - // - // The following code detects whether the src follows - // a continuous pattern. If the src follows such pattern (common-case) - // then we do not need to normalize the strides. 
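Stepping back to the `Context` changes a few hunks up: the string-based fp32 precision API is replaced by enum-keyed setters/getters, and the cuBLAS reduction toggles now take an `(allow_reduced_precision, allow_splitk)` pair and report a `CuBLASReductionOption`. A hedged usage sketch follows (not code from the patch; it only exercises the signatures declared in `Context.h` above):

```cpp
#include <ATen/Context.h>
#include <string>

// Configure TF32 matmul precision through the new enum-based API instead of
// the old string-based setFloat32Precision("cuda", "matmul", "tf32").
void configure_fp32(at::Context& ctx) {
  ctx.setFloat32Precision(
      at::Float32Backend::CUDA, at::Float32Op::MATMUL, at::Float32Precision::TF32);

  // Queries come back as enums; precision2str() recovers the legacy string.
  at::Float32Precision p =
      ctx.float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL);
  std::string s = at::precision2str(p);  // "tf32"

  // FP16/BF16 cuBLAS reductions now take (allow_reduced_precision, allow_splitk)
  // and report a CuBLASReductionOption rather than a bool.
  ctx.setAllowFP16ReductionCuBLAS(/*allow_reduced_precision=*/false,
                                  /*allow_splitk=*/true);
  at::CuBLASReductionOption opt = ctx.allowFP16ReductionCuBLAS();
  (void)s;
  (void)opt;
}

// Typical call site: configure_fp32(at::globalContext());
```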
- bool need_normalize_strides = false; - int64_t expected_stride = 1; - for (int i = src.dim() - 1; i >= 0; i--) { - // detect if we do not meet continuous pattern - // and the size is 1, so there is opportunity to normalize - if (src.stride(i) != expected_stride && src.size(i) == 1) { - need_normalize_strides = true; - break; - } - expected_stride *= src.size(i); - } - - // less common case, try normalizing the strides - if (need_normalize_strides) { - // create a new tensor with possibly normalized strides - // gh-83069 - auto shape = src.sizes(); - auto strides = src.strides().vec(); - for (int i = 0; i < src.dim(); i++) { - if (shape[i] < 2) { - strides[i] = 1; - } - } - view = src.as_strided(shape, strides, src.storage_offset()); - } - ATenDLMTensor* atDLMTensor(new ATenDLMTensor); - atDLMTensor->handle = view; + atDLMTensor->handle = src; atDLMTensor->tensor.manager_ctx = atDLMTensor; atDLMTensor->tensor.deleter = &deleter; - atDLMTensor->tensor.dl_tensor.data = view.data_ptr(); + atDLMTensor->tensor.dl_tensor.data = src.data_ptr(); atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device()); atDLMTensor->tensor.dl_tensor.ndim = static_cast(src.dim()); atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); - atDLMTensor->tensor.dl_tensor.shape = const_cast(view.sizes().data()); - atDLMTensor->tensor.dl_tensor.strides = const_cast(view.strides().data()); + atDLMTensor->tensor.dl_tensor.shape = const_cast(src.sizes().data()); + atDLMTensor->tensor.dl_tensor.strides = const_cast(src.strides().data()); atDLMTensor->tensor.dl_tensor.byte_offset = 0; fillVersion(&atDLMTensor->tensor); diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h index b1c2eaa2d6ea..928731fafb2f 100644 --- a/aten/src/ATen/DLConvertor.h +++ b/aten/src/ATen/DLConvertor.h @@ -52,16 +52,16 @@ struct DLPackTraits {}; template <> struct DLPackTraits { - inline static const char* capsule = "dltensor"; - inline static const char* used = "used_dltensor"; + inline static constexpr const char* capsule = "dltensor"; + inline static constexpr const char* used = "used_dltensor"; inline static auto toDLPack = at::toDLPack; inline static auto fromDLPack = at::fromDLPack; }; template <> struct DLPackTraits { - inline static const char* capsule = "dltensor_versioned"; - inline static const char* used = "used_dltensor_versioned"; + inline static constexpr const char* capsule = "dltensor_versioned"; + inline static constexpr const char* used = "used_dltensor_versioned"; inline static auto toDLPack = at::toDLPackVersioned; inline static auto fromDLPack = at::fromDLPackVersioned; }; diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index e34be30f9607..ac76d09537fa 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -16,8 +16,8 @@ inline void check_size_nonnegative(ArrayRef size) { inline void check_size_nonnegative(ArrayRef size) { for (const auto& x : size) { - TORCH_CHECK( - x.expect_size(__FILE__, __LINE__), + TORCH_SYM_CHECK( + x.sym_ge(0), "Trying to create tensor with negative dimension ", x, ": ", diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 090699339ccf..1bf46ebe61b6 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -468,7 +468,7 @@ inline Tensor _sum_to( // if we assume no reduction due to unbacked we ensure that at runtime. 
TORCH_MAYBE_SYM_CHECK( sym_eq(shape[i - leading_dims], sizes[i]), - "non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:", + "non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:", shape[i - leading_dims], ", ", sizes[i]) diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index 2cf8d9727f65..9631872875c6 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -9,11 +9,6 @@ namespace at::functionalization { -ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { - if (out_idx == this->out_index) return *this; - return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx); -} - // Note [Functionalization: Alias Removal Part 2] // See Note [Functionalization: Alias Removal] for more details. // This function applies a single update from one of the views to the StorageImpl. @@ -42,12 +37,12 @@ ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) { at::Tensor t = update.new_val; TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - if (update.view_metas.empty()) return t; + if (update.view_metas.empty()) { return t; } std::vector tmp_values({base}); tmp_values.reserve(update.view_metas.size()); for (size_t i = 0; i < update.view_metas.size() - 1; ++i) { - at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index); + at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back()); // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided // All of these ops require additional information to recover the sizes of the original tensor. // If need to, we could probably apply this optimization and only bother computing tmp_values @@ -55,9 +50,8 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co tmp_values.push_back(std::move(next_view)); } for(int64_t i = static_cast(update.view_metas.size()) - 1; i >= 0; --i) { - int64_t out_idx = update.view_metas[i].out_index; // Each view inverse is implemented in ViewInverses.cpp. - t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx); + t = update.view_metas[i]->reverse(tmp_values[i], t); } TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); return t; @@ -111,13 +105,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_)); } -void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector& metas) { +void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector>& metas) { TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); if (metas.size() > 1) { for (size_t i = 1; i < metas.size(); ++i) { // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI - TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided, + TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided, "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," "so this behavior is banned in compile. 
As a workaround, you can either remove the mutation from the model code, or you " diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index 8cd1cb7434aa..0c9c1fd775f3 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -8,44 +8,89 @@ namespace at::functionalization { // See Note [Functionalization Pass In Core] +enum class InverseReturnMode { + /// Specifies that functional inverses should always return a view. + AlwaysView, + /// Specifies that functional inverses should always return a non-view / copy. + NeverView, + /// Specifies that functional inverses should return a view unless a (copying) + /// scatter + /// inverse exists, in which case that will be used instead. + /// This avoids as_strided() calls that can be difficult for subclasses to + /// handle. + ViewOrScatterInverse, +}; + +#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \ + static const char* name() { \ + return #TYPE; \ + } + +#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \ + using SerializableTuple = std::tuple<__VA_ARGS__> + // ViewMeta is a class used by the functionalization pass to navigate between // a base tensor and a view tensor. // For example, if I call `b = a.view1(...)` -// the functionalization pass will generate and store a ViewMeta on b that looks -// like: +// the functionalization pass will generate and store a ViewMeta specialization +// for `view1` operation on b that looks like: // -// ViewMeta( -// [](const Tensor& base, int64_t mutated_view_idx) { -// return base.view1(...); -// }, -// [](const at::Tensor& base, const at::Tensor& mutated_view, -// int64_t mutated_view_idx) -> at::Tensor { -// return at::functionalization::impl::view1_inverse(base, mutated_view, -// ...); +// struct TORCH_API view1_ViewMeta : public ViewMeta { +// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta); +// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( +// bool /* reapply_views */, +// const std::vector&); +// +// view1_ViewMeta(const SerializableTuple& tpl) +// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} +// +// view1_ViewMeta(bool reapply_views, const std::vector& size) +// : ViewMeta(/*has_symbolic_inputs=*/false), +// reapply_views(reapply_views), +// size(size) {} +// +// Tensor forward(const Tensor& base) override { +// return base.view1(...); // } // -// The forward_fn lambda describes how to replay view1 on a tensor. +// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override { +// return at::functionalization::impl::view1_inverse(base, mutated_view, +// ...); +// } // -// The reverse_fn lambda describes how, given a tensor that is already a view, +// SerializableTuple to_serializable_tuple() { +// return std::make_tuple(reapply_views, size); +// } +// +// bool reapply_views; +// std::vector size; +// }; +// +// The forward function describes how to replay view1 on a tensor. +// +// The reverse function describes how, given a tensor that is already a view, // how to get the corresponding base tensor. See Note [Functionalization Pass: // View Inverses] for details. +// +// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type +// representing the `ViewMeta` instance state. Methods that take in/return such +// a type are used for supporting pickle serialization. 
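To complement the schematic `view1_ViewMeta` in the comment above, here is a compilable sketch of the same pattern for a hypothetical narrow-on-dim-0 view. The struct name and the use of `slice_scatter` as the inverse are illustrative assumptions, not part of this patch:

```cpp
#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

// Illustrative only: a ViewMeta for `base.narrow(0, start, length)` written
// against the new virtual forward()/reverse() interface.
struct narrow0_ViewMeta : public at::functionalization::ViewMeta {
  FUNCTIONALIZATION_VIEWMETA_NAME(narrow0_ViewMeta)
  FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
      int64_t /* start */,
      int64_t /* length */);

  narrow0_ViewMeta(const SerializableTuple& tpl)
      : narrow0_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
  narrow0_ViewMeta(int64_t start, int64_t length)
      : ViewMeta(/*has_symbolic_inputs=*/false), start(start), length(length) {}

  at::Tensor forward(const at::Tensor& base) override {
    return base.narrow(0, start, length);
  }

  at::Tensor reverse(const at::Tensor& base, const at::Tensor& mutated_view) override {
    // Write the mutated view back into its slot in the base (assumption:
    // slice_scatter is an adequate inverse for a dim-0 narrow).
    return base.slice_scatter(mutated_view, /*dim=*/0, start, start + length);
  }

  SerializableTuple to_serializable_tuple() {
    return std::make_tuple(start, length);
  }

  int64_t start;
  int64_t length;
};
```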
struct ViewMeta { ViewMeta( - std::function forward, - std::function reverse, bool has_symbolic_inputs, bool is_multi_output = false, bool is_as_strided = false, int64_t out_idx = 0) - : forward_fn(std::move(forward)), - reverse_fn(std::move(reverse)), - out_index(out_idx), + : out_index(out_idx), is_multi_output(is_multi_output), is_as_strided(is_as_strided), has_symbolic_inputs(has_symbolic_inputs) {} - std::function forward_fn; - std::function reverse_fn; + virtual ~ViewMeta() = default; + + virtual Tensor forward(const Tensor& base) = 0; + virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0; + // See Note [out_idx in ViewMeta] int64_t out_index; @@ -57,10 +102,17 @@ struct ViewMeta { // Tells us if this view operation has any symbolic inputs bool has_symbolic_inputs; - // Returns a copy of the current ViewMeta, if out_idx matches the current - // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse + // Returns a new ViewMeta with the same forward/reverse // functions, but a new out index. - ViewMeta to_out_idx(int64_t out_idx); + // + // This method should be implemented by those `ViewMeta` that have more than + // one output. + virtual std::shared_ptr to_out_index(int64_t out_index) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "ViewMeta::to_out_index not implemented. ", + "Likely because there's only one output."); + } }; // FunctionalStorageImpl is a subclass of StorageImpl used by the @@ -93,14 +145,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const at::Tensor new_val; // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) - const std::vector view_metas; + const std::vector> view_metas; }; explicit FunctionalStorageImpl(const Tensor& value); void add_update( const Tensor& updated_val, - const std::vector& view_metas); + const std::vector>& view_metas); bool apply_updates(); const Tensor& base() { return base_; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 7d5e4e84e861..d553cc1fb949 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -129,17 +129,19 @@ void FunctionalTensorWrapper::freeze_storage() const { // - view_value: The output tensor that we need to wrap. // - base: The "base" of the view that `view_value` was generated from. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. 
-FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta) - : c10::TensorImpl( - c10::DispatchKeySet(DispatchKey::Functionalize), - view_value.dtype(), - view_value.device() - ), - value_(view_value), - is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output), - was_storage_changed_(base->was_storage_changed_), - is_symbolic_(base->is_symbolic_) -{ +FunctionalTensorWrapper::FunctionalTensorWrapper( + const Tensor& view_value, + const FunctionalTensorWrapper* base, + const std::shared_ptr& meta) + : c10::TensorImpl( + c10::DispatchKeySet(DispatchKey::Functionalize), + view_value.dtype(), + base->storage().data_ptr().device()), + value_(view_value), + is_multi_output_view_( + base->is_multi_output_view_ || meta->is_multi_output), + was_storage_changed_(base->was_storage_changed_), + is_symbolic_(base->is_symbolic_) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); set_constructor_metadata(); @@ -148,11 +150,10 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const view_metas_ = base->view_metas_; // copy } view_metas_.push_back(meta); - maybe_mark_symbolic(meta); + maybe_mark_symbolic(meta.get()); storage_ = base->storage_; // alias this tensor's storage with the base tensor's } - functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { return static_cast(storage_.unsafeGetStorageImpl()); } @@ -176,18 +177,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const { } // See Note [Functionalization Pass - Inplace View Ops] -void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) { +void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr& meta) { view_metas_.push_back(meta); // Manually track the fact that this tensor received a metadata mutation! has_metadata_mutation_ = true; // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. - maybe_mark_symbolic(meta); + maybe_mark_symbolic(meta.get()); // Note [Functionalization Pass - Inplace View Ops] // So, these ops are special - they're mutation AND view ops. They get special codegen. // An example is transpose_, e.g. `a.transpose_()` // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. 
at::AutoDispatchSkipFunctionalize guard; - value_ = meta.forward_fn(value_, meta.out_index); + value_ = meta->forward(value_); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); } @@ -368,15 +369,8 @@ void FunctionalTensorWrapper::sync_() { regenerate_from_base(); } -Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) { - auto t = base; - - // Reapply views to get the viewed tensor from the base in alias_ - for (auto& view_meta: view_metas_) { - t = view_meta.forward_fn(t, view_meta.out_index); - } - - return t; +const std::vector>& FunctionalTensorWrapper::view_metas() const { + return view_metas_; } void FunctionalTensorWrapper::regenerate_from_base() { @@ -385,7 +379,7 @@ void FunctionalTensorWrapper::regenerate_from_base() { auto t = storage_impl->base(); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - t = apply_view_metas(t); + t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); replace_(t, /*from_lazy_regenerate=*/true); @@ -485,7 +479,10 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptrdevice(); + // The storage pointer already uses the underlying tensor custom device (if + // applicable) to extract the device. So, we dont have to recurse again by + // doing value_.unsafeGetTensorImpl()->device(). + return storage().data_ptr().device(); } at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const { return value_.unsafeGetTensorImpl()->sizes(); @@ -724,11 +721,11 @@ bool isFunctionalTensor(const std::optional& t) { } bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { - if (t_list.empty()) return false; + if (t_list.empty()) { return false; } auto functional_count = 0; for (const auto i : c10::irange(t_list.size())) { auto const & e= t_list[i]; - if (!e.has_value() || !e->defined()) continue; + if (!e.has_value() || !e->defined()) { continue; } if (isFunctionalTensor(e)) { ++functional_count; } @@ -738,10 +735,10 @@ bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { template static bool isFunctionalTensorIListRef(c10::IListRef list) { - if (list.size() == 0) return false; + if (list.size() == 0) { return false; } auto functional_count = 0; for (const auto& tensor : list) { - if (!tensor.defined()) continue; + if (!tensor.defined()) { continue; } if (isFunctionalTensor(tensor)) { ++functional_count; } @@ -759,20 +756,28 @@ void freeze_functional_tensor(const Tensor& tensor) { functional_base_impl->freeze_storage(); } -Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) { +Tensor create_functional_tensor_with_view_meta( + const at::Tensor& view_to_wrap, + const at::Tensor& base, + const std::shared_ptr& meta, + int64_t out_idx) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); + auto meta_ = meta; if (out_idx != 0) { // Note [out_idx in ViewMeta] // When a view op outputs multiple tensors, each output needs its own separate ViewMeta. // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. 
- meta = meta.to_out_idx(out_idx); + meta_ = meta->to_out_index(out_idx); } - return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta); + return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta_); } -std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) { +std::vector create_functional_tensor_with_view_meta( + ITensorListRef view_to_wrap, + const at::Tensor& base, + const std::shared_ptr& meta) { std::vector outputs(view_to_wrap.size()); int64_t i = 0; for (const auto& tensor : view_to_wrap) { @@ -782,12 +787,22 @@ std::vector create_functional_tensor_with_view_meta(ITensorListRef view_ return outputs; } -void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) { +void mutate_view_meta(const at::Tensor& self, const std::shared_ptr& meta) { TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); self_impl->mutate_view_meta(meta); } +Tensor apply_view_meta_sequence( + const Tensor& base, + const std::vector>& sequence) { + Tensor r = base; + for (auto& vm : sequence) { + r = vm->forward(r); + } + return r; +} + // Note [Propagating strides in the functionalization pass] // In order to properly compute stride information, the functionalization pass // calls each {view} reference implementations with meta tensors. @@ -881,7 +896,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s const auto& ivalue = returns[idx]; if (ivalue.isTensor()) { const auto& t = ivalue.toTensor(); - if (!t.defined()) continue; + if (!t.defined()) { continue; } at::functionalization::impl::sync(t); auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index b260b7c9f958..6d9050728da7 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); // Get the underlying, actual tensor, that doesn't know anything about // functionalization. @@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { ->are_all_mutations_under_no_grad_or_inference_mode(); } - void maybe_mark_symbolic(const functionalization::ViewMeta& meta) { - is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs; + void maybe_mark_symbolic(functionalization::ViewMeta* meta) { + is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; } bool is_symbolic() const { return is_symbolic_; } - // Runs the forward_fn of every ViewMeta collected in the current instance - // to some other base. - Tensor apply_view_metas(const Tensor& base); + // Retrieves the ViewMeta sequence of this tensor. + const std::vector>& view_metas() + const; // Sync's the underlying tensor with its alias, if it's out of date. This // involves two steps: 1) Apply any pending updates/mutations to the alias 2) @@ -146,7 +146,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // from the base tensor. This method is used by inplace-view ops like // transpose_. 
It appends a ViewMeta to the existing stack, and refreshes the // tensor by replaying the views off of the alias. - void mutate_view_meta(const at::functionalization::ViewMeta& meta); + void mutate_view_meta( + const std::shared_ptr& meta); // Custom implementation of self.set_(src) void set__impl(const FunctionalTensorWrapper* other); @@ -285,7 +286,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { bool is_symbolic_ = false; size_t generation_ = 0; - std::vector view_metas_; + std::vector> view_metas_; protected: static void copy_tensor_metadata( @@ -377,16 +378,20 @@ TORCH_API void propagate_xla_data_direct( Tensor create_functional_tensor_with_view_meta( const Tensor& view_to_wrap, const Tensor& base, - functionalization::ViewMeta meta, + const std::shared_ptr& meta, int64_t out_idx = 0); std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const Tensor& base, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); void mutate_view_meta( const Tensor& self, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); + +TORCH_API Tensor apply_view_meta_sequence( + const Tensor& base, + const std::vector>& sequence); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset( diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 97094c9f125a..10f988b4d281 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -7,7 +9,6 @@ #include #include #include -#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -28,6 +29,31 @@ #include #endif +namespace at::functionalization { + +Tensor resize__ViewMeta::forward(const Tensor& base) { + if (reapply_views) { + return base.as_strided(size, c10::contiguous_strides(size)); + } else { + return at::as_strided_copy(base, size, c10::contiguous_strides(size)); + } +} + +Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { + return base.as_strided_scatter( + mutated_view, size, c10::contiguous_strides(size)); +} + +Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) { + return at::_unsafe_view_symint(base, size); +} + +Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { + return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); +} + +} // namespace at::functionalization + namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { const auto& schema = op.schema(); @@ -106,7 +132,9 @@ namespace { const auto& ivalue = returns[idx]; if (ivalue.isTensor() && should_wrap_outputs) { const auto& t = ivalue.toTensor(); - if (!t.defined()) continue; + if (!t.defined()) { + continue; + } auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; } else if (ivalue.isTensorList() && should_wrap_outputs) { @@ -169,19 +197,8 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch // The output of resizing is equivalent to taking a slice of a larger tensor. // We have to emulate this "slicing" with an as_strided call. 
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - if (reapply_views) { - return base.as_strided(size, c10::contiguous_strides(size)); - } else { - return at::as_strided_copy(base, size, c10::contiguous_strides(size)); - } - }, - [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size)); - }, - /*has_symbolic_inputs=*/false - ); + auto view_meta = std::make_shared( + reapply_views, size.vec()); at::functionalization::impl::mutate_view_meta(self, view_meta); return self; } @@ -300,17 +317,11 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt tmp_output = at::_unsafe_view_symint(self_, size); } - bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); - - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return at::_unsafe_view_symint(base, size); - }, - [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); - }, - /*has_symbolic_inputs=*/has_symbolic_inputs - ); + bool has_symbolic_inputs = std::any_of( + size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); + auto view_meta = + std::make_shared( + has_symbolic_inputs, size.vec()); auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); // See Note [Propagating strides in the functionalization pass] diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.h b/aten/src/ATen/FunctionalizeFallbackKernel.h new file mode 100644 index 000000000000..aabcfc827af3 --- /dev/null +++ b/aten/src/ATen/FunctionalizeFallbackKernel.h @@ -0,0 +1,58 @@ +#pragma once + +#include + +namespace at::functionalization { + +// `ViewMeta` implementation for `resize_` operation. +struct TORCH_API resize__ViewMeta : public ViewMeta { + FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta) + FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( + bool /* reapply_views */, + const std::vector&); + + resize__ViewMeta(const SerializableTuple& tpl) + : resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} + + resize__ViewMeta(bool reapply_views, const std::vector& size) + : ViewMeta(/*has_symbolic_inputs=*/false), + reapply_views(reapply_views), + size(size) {} + + Tensor forward(const Tensor& base) override; + Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; + + SerializableTuple to_serializable_tuple() { + return std::make_tuple(reapply_views, size); + } + + bool reapply_views; + std::vector size; +}; + +// `ViewMeta` implementation for `_unsafe_view` operation. 
+struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta { + FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta) + FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( + bool /* has_symbolic_inputs */, + const std::vector&); + + _unsafe_view_ViewMeta(const SerializableTuple& tpl) + : _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} + + _unsafe_view_ViewMeta( + bool has_symbolic_inputs, + const std::vector& size) + : ViewMeta(has_symbolic_inputs), size(size) {} + + Tensor forward(const Tensor& base) override; + Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; + + SerializableTuple to_serializable_tuple() { + return std::make_tuple(has_symbolic_inputs, size); + } + + std::vector size; +}; + +} // namespace at::functionalization diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index e701882a2606..817bf0ddba0b 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -26,9 +27,7 @@ inline void infer_size_impl( std::optional infer_dim; for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) { - if (infer_dim) { - throw std::runtime_error("only one dimension can be inferred"); - } + TORCH_CHECK(!infer_dim, "only one dimension can be inferred"); infer_dim = dim; } else { // in case of unbacked shape[dim] we assume it's not -1 and add a runtime @@ -45,7 +44,39 @@ inline void infer_size_impl( } } - auto set_infer_dim = [&]() { + if (infer_dim) { + // numel is the product of known sizes, it has to be divisible by newsize. + // and newsize should be positive unless newsize == numel (we throw + // different) error message in that case. + if constexpr (std::is_same_v) { + auto v = newsize.maybe_as_int(); + if (v and *v == 0) { + // Avoid div by 0 when sym_eq(numel % newsize, 0) is constructed! + // which may happen when newsize is not a symbol! if its a symbol + // division won't happen anyway during compile. + TORCH_MAYBE_SYM_CHECK( + numel == newsize, + "shape '", + shape, + "' is invalid for input of size ", + numel); + } else { + auto cond = sym_gt(newsize, 0) + .sym_and(sym_eq(numel % newsize, 0)) + .sym_or(sym_eq(numel, newsize)); + TORCH_MAYBE_SYM_CHECK( + cond, "shape '", shape, "' is invalid for input of size ", numel); + } + + } else { + TORCH_CHECK( + (newsize > 0 && (numel % newsize == 0)) || numel == newsize, + "shape '", + shape, + "' is invalid for input of size ", + numel); + } + // We have a degree of freedom here to select the dimension size; follow // NumPy semantics and just bail. However, a nice error message is needed // because users often use `view` as a way to flatten & unflatten @@ -54,18 +85,14 @@ inline void infer_size_impl( // works yet // empty_tensor.view(-1, 0) // doesn't. 
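For the concrete (non-symbolic) branch, the condition enforced above is a divisibility check of the total element count against the product of the known dimensions. A standalone sketch of that logic, illustrative rather than the ATen implementation:

#include <cstdint>
#include <stdexcept>
#include <vector>

// Infer the single -1 entry of `shape` so the total element count matches `numel`.
std::vector<int64_t> infer_size_sketch(std::vector<int64_t> shape, int64_t numel) {
  int64_t newsize = 1;   // product of the known (non -1) dimensions
  int64_t infer_dim = -1;
  for (int64_t d = 0; d < static_cast<int64_t>(shape.size()); ++d) {
    if (shape[d] == -1) {
      if (infer_dim != -1) throw std::runtime_error("only one dimension can be inferred");
      infer_dim = d;
    } else {
      newsize *= shape[d];
    }
  }
  if (infer_dim == -1) {
    if (numel != newsize) throw std::runtime_error("shape is invalid for input size");
    return shape;
  }
  // same condition as the check above: divisible, or the degenerate equal case
  if (!((newsize > 0 && numel % newsize == 0) || numel == newsize))
    throw std::runtime_error("shape is invalid for input size");
  if (newsize == 0)
    throw std::runtime_error("cannot infer -1: the remaining dimensions are ambiguous");
  shape[infer_dim] = numel / newsize;   // e.g. {-1, 4} with numel == 12 -> {3, 4}
  return shape;
}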
- TORCH_CHECK( + TORCH_MAYBE_SYM_CHECK( newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape, " because the unspecified dimension size -1 can be any " "value and is ambiguous"); - res[*infer_dim] = numel / newsize; - return; - }; - if (infer_dim && newsize > 0 && numel % newsize == 0) { - set_infer_dim(); + res[*infer_dim] = numel / newsize; return; } @@ -75,9 +102,6 @@ inline void infer_size_impl( shape, "' is invalid for input of size ", numel); - if (infer_dim) { - set_infer_dim(); - } } inline std::vector infer_size(IntArrayRef shape, int64_t numel) { diff --git a/aten/src/ATen/LegacyBatchingRegistrations.cpp b/aten/src/ATen/LegacyBatchingRegistrations.cpp index 4c8c07f84e96..2c54718e938f 100644 --- a/aten/src/ATen/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/LegacyBatchingRegistrations.cpp @@ -58,7 +58,7 @@ namespace at { namespace{ // PyTorch allows operations to specify dim 0 and dim -1 on a scalar tensor. -static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; } @@ -365,7 +365,7 @@ Tensor select_batching_rule(const Tensor& self, int64_t dim, int64_t index) { return self_physical.getPhysicalToLogicalMap().apply(result); } -static int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) { +int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) { return maybe_wrap_dim(dim, static_cast(input_sizes.size())) + num_batch_dims; } @@ -488,7 +488,7 @@ Tensor view_as_complex_batching_rule(const Tensor& self) { // Checks that the smallest batch stride is greater than the largest example // stride. This is something we can support but we choose not to because it's // potentially error prone. -static void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) { +void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) { auto smallest_batch_stride = std::min_element( physical_strides.begin(), physical_strides.begin() + num_batch_dims); auto largest_example_stride = std::max_element( @@ -508,7 +508,7 @@ static void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t // given (sizes, strides, storage_offset) returns the maximum location that // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). -static std::optional maximum_indexable_location( +std::optional maximum_indexable_location( IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { @@ -521,7 +521,7 @@ static std::optional maximum_indexable_location( // This checks that the range of possible memory locations accessible by // x.as_strided(sizes, strides, maybe_storage_offset) // are within the bounds of possible memory locations accessible by x. 
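Concretely, the bound being compared is the largest storage location a layout can touch: storage_offset plus the sum of (size - 1) * stride over all dimensions. A small standalone illustration of what maximum_indexable_location computes (assumes non-negative strides; not the ATen code):

#include <cstdint>
#include <optional>
#include <vector>

// Largest storage location reachable by a view with (sizes, strides, storage_offset);
// nullopt for layouts with a zero-sized dim, which touch no memory at all.
std::optional<int64_t> max_location_sketch(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& strides,
    int64_t storage_offset) {
  int64_t span = 1;  // elements spanned by the layout, as in native::storage_size_for
  for (size_t d = 0; d < sizes.size(); ++d) {
    if (sizes[d] == 0) return std::nullopt;
    span += (sizes[d] - 1) * strides[d];
  }
  return storage_offset + span - 1;
}
// e.g. sizes {2, 3}, strides {3, 1}, offset 4 -> 4 + (1*3 + 2*1) = 9.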
-static void checkBasicAsStridedValidForSlice( +void checkBasicAsStridedValidForSlice( const Tensor& physical_tensor, int64_t num_batch_dims, IntArrayRef sizes, diff --git a/aten/src/ATen/MapAllocator.cpp b/aten/src/ATen/MapAllocator.cpp index 63a278050e8a..ed697c32b58a 100644 --- a/aten/src/ATen/MapAllocator.cpp +++ b/aten/src/ATen/MapAllocator.cpp @@ -62,7 +62,7 @@ constexpr const char* unknown_eventname = "eventname not specified"; #endif } // namespace (anonymous) -MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size) +MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size) : filename_(filename.empty() ? unknown_filename : filename) , size_(0) // to be filled later #ifdef _WIN32 @@ -292,6 +292,28 @@ MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, if (ftruncate(fd, static_cast(size)) == -1) { TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", c10::utils::str_error(errno), " (", errno, ")"); } + +#ifdef HAVE_POSIX_FALLOCATE + if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) { + for (;;) { + if (posix_fallocate(fd, 0, static_cast(size)) == 0) { + break; + } + + if (errno == EINTR) { + continue; + } + + if (errno == EINVAL || errno == EOPNOTSUPP) { + // the underlying filesystem does not support the operation + break; + } + + TORCH_CHECK(false, "unable to allocate shared memory(shm) for file <", filename_, ">: ", c10::utils::str_error(errno), " (", errno, ")"); + } + } +#endif + if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast(size)) { #ifndef STRIP_ERROR_MESSAGES int last_err = errno; @@ -472,7 +494,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags, initializeAlloc(); } -RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size) +RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size) : RefcountedMapAllocatorArgCheck(flags) , MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) { @@ -592,7 +614,7 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU}; } -at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size(); return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU}; @@ -604,7 +626,7 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags, return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU}; } -at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment; return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU}; diff --git 
a/aten/src/ATen/MapAllocator.h b/aten/src/ATen/MapAllocator.h index 9fc5e32adcb5..7a3415a4c411 100644 --- a/aten/src/ATen/MapAllocator.h +++ b/aten/src/ATen/MapAllocator.h @@ -25,7 +25,7 @@ class TORCH_API MapAllocator { public: MapAllocator(std::string_view filename, int flags, size_t size); MapAllocator( - WithFd, + WithFd /*unused*/, std::string_view filename, int fd, int flags, @@ -59,14 +59,14 @@ class TORCH_API MapAllocator { return flags_; } - static MapAllocator* fromDataPtr(const at::DataPtr&); + static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/); static at::DataPtr makeDataPtr( std::string_view filename, int flags, size_t size, size_t* actual_size_out); static at::DataPtr makeDataPtr( - WithFd, + WithFd /*unused*/, const char* filename, int fd, int flags, @@ -105,13 +105,13 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, public: RefcountedMapAllocator(const char* filename, int flags, size_t size); RefcountedMapAllocator( - WithFd, + WithFd /*unused*/, const char* filename, int fd, int flags, size_t size); - static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&); + static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/); RefcountedMapAllocator(const RefcountedMapAllocator&) = delete; RefcountedMapAllocator(RefcountedMapAllocator&&) = delete; RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete; @@ -122,7 +122,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, size_t size, size_t* actual_size_out); static at::DataPtr makeDataPtr( - WithFd, + WithFd /*unused*/, const char* filename, int fd, int flags, diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 09fbedd4056d..2de73a70dd33 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -179,7 +179,7 @@ void propagate_names_except(const Tensor& result, const Tensor& src, IntArrayRef return; } const auto src_names = src.names(); - const auto result_dim = static_cast(result.dim()); + const auto result_dim = result.dim(); const auto src_dim = static_cast(src_names.size()); const auto excluded_dim = static_cast(excluded_idxs.size()); TORCH_INTERNAL_ASSERT(src_dim - excluded_dim == result_dim); diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index 63bd867f9022..ea951ed3db13 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -273,7 +273,7 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const { return NestedTensorImpl::numel_custom(); } -c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat) const { +c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const { return nested_tensor_impl_is_contiguous(this); } IntArrayRef NestedTensorImpl::sizes_custom() const { diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h index cddf37df34a5..9b92e9ec83ad 100644 --- a/aten/src/ATen/NestedTensorImpl.h +++ b/aten/src/ATen/NestedTensorImpl.h @@ -115,7 +115,8 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { // with real implementations int64_t numel_custom() const override; c10::SymInt sym_numel_custom() const override; - c10::SymBool sym_is_contiguous_custom(MemoryFormat) const override; + c10::SymBool sym_is_contiguous_custom( + MemoryFormat /*memory_format*/) const override; int64_t size_custom(int64_t d) const override { return this->size(d); } diff --git a/aten/src/ATen/Parallel.h 
b/aten/src/ATen/Parallel.h index b55dad02f347..d09a33841b94 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -14,7 +14,7 @@ inline int64_t divup(int64_t x, int64_t y) { TORCH_API void init_num_threads(); // Sets the number of threads to be used in parallel region -TORCH_API void set_num_threads(int); +TORCH_API void set_num_threads(int /*nthreads*/); // Returns the maximum number of threads that may be used in a parallel region TORCH_API int get_num_threads(); @@ -37,7 +37,7 @@ inline void lazy_init_num_threads() { } } -TORCH_API void set_thread_num(int); +TORCH_API void set_thread_num(int /*id*/); class TORCH_API ThreadIdGuard { public: @@ -130,7 +130,7 @@ inline scalar_t parallel_reduce( TORCH_API std::string get_parallel_info(); // Sets number of threads used for inter-op parallelism -TORCH_API void set_num_interop_threads(int); +TORCH_API void set_num_interop_threads(int /*nthreads*/); // Returns the number of threads used for inter-op parallelism TORCH_API size_t get_num_interop_threads(); diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp index e4105bf8468f..e90065543e35 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.cpp +++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp @@ -42,8 +42,14 @@ const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() { } bool torch_function_mode_enabled() { - return PythonTorchFunctionTLS::get_disabled_state() != TorchFunctionDisabledState::ALL_DISABLED && - PythonTorchFunctionTLS::stack_len() > 0; + // Manually flatten because gcc is refusing to inline here. Note + // that we are still calling __tls_get_addr twice here with GCC, + // presumably because of + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81501 (which says + // the fix ships in GCC 16), but forcing inlining still improves + // performance. + const auto& ptfs = pythonTorchFunctionState; + return ptfs.disabled_state_ != TorchFunctionDisabledState::ALL_DISABLED && !ptfs.stack_.empty(); } // This is needed to disambiguate the ternary torch function disabled states diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h index a245a55ebdc4..502bb535be05 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.h +++ b/aten/src/ATen/PythonTorchFunctionTLS.h @@ -27,6 +27,7 @@ struct TORCH_API PythonTorchFunctionTLS { TorchFunctionDisabledState disabled_state_ = TorchFunctionDisabledState::ENABLED; std::vector> stack_; + friend TORCH_API bool torch_function_mode_enabled(); }; TORCH_API bool torch_function_mode_enabled(); diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp index e05e3145fdf3..69d0c243156f 100644 --- a/aten/src/ATen/SavedTensorHooks.cpp +++ b/aten/src/ATen/SavedTensorHooks.cpp @@ -13,7 +13,7 @@ namespace { // and left at true for the rest of the execution. // It's an optimization so that users who never use default hooks don't need to // read the thread_local variables pack_hook_ and unpack_hook_. 
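The PythonTorchFunctionTLS hunks above avoid accessor calls on the hot path by befriending the free function so it can read the thread_local fields directly. A generic sketch of that pattern (names are hypothetical, not the PyTorch types):

#include <vector>

bool mode_enabled();  // forward-declared so the TLS holder can befriend it

struct ModeTLS {
  static const ModeTLS& get();
 private:
  bool disabled_ = false;
  std::vector<int> stack_;
  friend bool mode_enabled();  // grant the hot predicate direct field access
};

namespace {
thread_local ModeTLS tls_state;  // one instance per thread
}

const ModeTLS& ModeTLS::get() { return tls_state; }

// Hot path: a single TLS lookup, no out-of-line accessors the compiler must inline.
bool mode_enabled() {
  const auto& s = tls_state;
  return !s.disabled_ && !s.stack_.empty();
}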
- static bool is_initialized(false); + bool is_initialized(false); } static void assertSavedTensorHooksNotDisabled() { diff --git a/aten/src/ATen/SparseCsrTensorImpl.cpp b/aten/src/ATen/SparseCsrTensorImpl.cpp index f73d75ab53ad..dec6d2e95960 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.cpp +++ b/aten/src/ATen/SparseCsrTensorImpl.cpp @@ -252,7 +252,7 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) { void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) { TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset."); } -c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat) const { +c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const { TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous"); } } // namespace at diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h index 14688163a374..e764f954db33 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.h +++ b/aten/src/ATen/SparseCsrTensorImpl.h @@ -32,10 +32,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { public: explicit SparseCsrTensorImpl( - at::DispatchKeySet, + at::DispatchKeySet /*key_set*/, at::Device device, Layout layout, - const caffe2::TypeMeta); + const caffe2::TypeMeta /*data_type*/); void resize_(int64_t nnz, IntArrayRef size); void resize_and_clear_( @@ -86,7 +86,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { protected: IntArrayRef strides_custom() const override; SymIntArrayRef sym_strides_custom() const override; - SymBool sym_is_contiguous_custom(MemoryFormat) const override; + SymBool sym_is_contiguous_custom( + MemoryFormat /*memory_format*/) const override; public: void set_size(int64_t dim, int64_t new_size) override; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index b10795fbc37e..a2c12fcfe8b9 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -46,7 +46,9 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { public: // Public for now... 
- explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta); + explicit SparseTensorImpl( + at::DispatchKeySet /*key_set*/, + const caffe2::TypeMeta /*data_type*/); void release_resources() override; @@ -229,14 +231,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { } void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { - return _resize_(sparse_dim, dense_dim, size); + _resize_(sparse_dim, dense_dim, size); } void resize_( int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { - return _resize_(sparse_dim, dense_dim, size); + _resize_(sparse_dim, dense_dim, size); } // NOTE: this function will resize the sparse tensor and also set `indices` @@ -384,8 +386,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { private: explicit SparseTensorImpl( - at::DispatchKeySet, - const caffe2::TypeMeta, + at::DispatchKeySet /*key_set*/, + const caffe2::TypeMeta /*data_type*/, at::Tensor indices, at::Tensor values); diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index bd50282b46ec..1fa852686656 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef indices, c } } - return set_item(self, indices, value); + set_item(self, indices, value); } } // namespace indexing diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index a487589833e8..9291d2e66e5f 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -112,10 +112,10 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice); // `torch.tensor([1, 2])`) | `torch::tensor({1, 2})` struct TORCH_API TensorIndex final { // Case 1: `at::indexing::None` - TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {} + TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {} // Case 2: "..." / `at::indexing::Ellipsis` - TensorIndex(at::indexing::EllipsisIndexType) + TensorIndex(at::indexing::EllipsisIndexType /*unused*/) : type_(TensorIndexType::Ellipsis) {} TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) { TORCH_CHECK_VALUE( @@ -214,7 +214,7 @@ inline Tensor applySlice( "step must be greater than zero"); // See NOTE [nested tensor size for indexing] - if (self_sizes.has_value() && self_sizes.value().size() > 0) { + if (self_sizes.has_value() && !self_sizes.value().empty()) { // Skip this optimization if we are tracing, as the trace may be polymorphic // over the shape of the `self` tensor, and we still want to record // the slice. 
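The TensorIndex constructors touched above (std::nullopt / Ellipsis) are what back the braced C++ indexing API; a short usage sketch assuming the standard libtorch indexing surface:

#include <ATen/ATen.h>
#include <ATen/TensorIndexing.h>

void indexing_sketch() {
  using namespace at::indexing;
  at::Tensor t = at::arange(24).reshape({2, 3, 4});
  // Python: t[None, ..., 1:3]
  at::Tensor v = t.index({None, Ellipsis, Slice(1, 3)});
  // Python: t[0, :, 0] = -1
  t.index_put_({0, Slice(), 0}, -1);
  (void)v;
}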
diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 9096cbfc68eb..d0bbe2d76548 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -56,7 +56,7 @@ inline void get_strides(int64_t* strides, ArrayRef operands, int64_ } } -static OptionalTensorRef make_otr(const TensorBase &tensor) { +OptionalTensorRef make_otr(const TensorBase &tensor) { if (tensor.defined()) { return OptionalTensorRef(tensor); } else { @@ -765,7 +765,8 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { if (numel == 0) { return; } else if (numel < grain_size || at::get_num_threads() == 1) { - return serial_for_each(loop, {0, numel}); + serial_for_each(loop, {0, numel}); + return; } else { at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) { serial_for_each(loop, {begin, end}); diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index d8eebd4c06a4..d8593a80292b 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -250,7 +250,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { using PtrVector = SmallVector; using StrideVector = SmallVector; - void build(TensorIteratorConfig&); + void build(TensorIteratorConfig& /*config*/); // The inner-loop function operates on the fastest moving dimension. It // implements element-wise operations in terms of 1-d strided tensors. @@ -618,20 +618,20 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { #undef TORCH_DISALLOW_TEMPORARIES protected: // Mutable reference as it moves tensors out of TensorIteratorConfig - void populate_operands(TensorIteratorConfig&); + void populate_operands(TensorIteratorConfig& /*config*/); void mark_outputs(); - void mark_resize_outputs(const TensorIteratorConfig&); - void compute_mem_overlaps(const TensorIteratorConfig&); - void compute_shape(const TensorIteratorConfig&); - void compute_strides(const TensorIteratorConfig&); + void mark_resize_outputs(const TensorIteratorConfig& /*config*/); + void compute_mem_overlaps(const TensorIteratorConfig& /*config*/); + void compute_shape(const TensorIteratorConfig& /*config*/); + void compute_strides(const TensorIteratorConfig& /*config*/); void reorder_dimensions(); void permute_dimensions(IntArrayRef perm); - void compute_types(const TensorIteratorConfig&); + void compute_types(const TensorIteratorConfig& /*config*/); ScalarType compute_common_dtype(); void allocate_or_resize_outputs(); - bool fast_set_up(const TensorIteratorConfig&); - FastSetupType compute_fast_setup_type(const TensorIteratorConfig&); - void compute_names(const TensorIteratorConfig&); + bool fast_set_up(const TensorIteratorConfig& /*config*/); + FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/); + void compute_names(const TensorIteratorConfig& /*config*/); void propagate_names_to_outputs(); void coalesce_dimensions(); diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 34cb5329de6a..8236751679f0 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -273,11 +273,11 @@ void checkLayout(CheckedFrom c, at::ArrayRef tensors, at::Layout layout) } void * maybe_data_ptr(const Tensor& tensor) { - return tensor.defined() ? (void *)tensor.data_ptr() : nullptr; + return tensor.defined() ? tensor.data_ptr() : nullptr; } void * maybe_data_ptr(const TensorArg& tensor) { - return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; + return tensor->defined() ? 
tensor->data_ptr() : nullptr; } void check_dim_size( diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index 95a35bd5563a..e9c936b906c6 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -20,7 +20,7 @@ namespace at { -TORCH_API int _crash_if_asan(int); +TORCH_API int _crash_if_asan(int /*arg*/); // Converts a TensorList (i.e. ArrayRef to vector of TensorImpl*) // NB: This is ONLY used by legacy TH bindings, and ONLY used by cat. diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 4b8b5f6c5d18..e3424cc4cb8e 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -36,7 +36,7 @@ namespace { using weakref_type = c10::weak_intrusive_ptr; using val_type = std::tuple; -static ska::flat_hash_map& get_cached_casts() { +ska::flat_hash_map& get_cached_casts() { static ska::flat_hash_map cached_casts; return cached_casts; } @@ -148,7 +148,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_ Banned functions *******************************/ -static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional&, int64_t) { +static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional& /*unused*/, int64_t /*unused*/) { TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n" "Many models use a sigmoid layer right before the binary cross entropy layer.\n" "In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n" diff --git a/aten/src/ATen/core/CachingHostAllocator.cpp b/aten/src/ATen/core/CachingHostAllocator.cpp index 5939253caf55..f3ddaedc5ecd 100644 --- a/aten/src/ATen/core/CachingHostAllocator.cpp +++ b/aten/src/ATen/core/CachingHostAllocator.cpp @@ -6,9 +6,9 @@ namespace at { namespace { -static std::array +std::array allocator_array{}; -static std::array +std::array allocator_priority{}; } // anonymous namespace diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 53e95cd2d4cf..c9eacbed42ef 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -49,19 +50,57 @@ namespace { constexpr size_t MAX_SIZE_INDEX = 64; } +// A large reserved pinned memory segment that is created in advance which is used +// to allocate small pinned memory requests to avoid calling into expensive APIs. +// We never free this memory and move up the pointer as we allocate new blocks +// and when blocks are freed, they are cached in the free lists. +struct PinnedReserveSegment { + PinnedReserveSegment(void *start, size_t size) : start_(start), size_(size), + current_ptr_(start_), initialized_(true) {} + + PinnedReserveSegment() : start_(nullptr), size_(0), current_ptr_(nullptr), initialized_(false) {} + + bool initialized() { + return initialized_; + } + + void* allocate(size_t bytes) { + std::lock_guard guard(mutex_); + + // Round up the requested size to 4KB boundary for all including the small ones. 
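The 4 KiB round-up performed next is the usual power-of-two trick; a tiny standalone check of its behavior, not part of the patch:

#include <cstddef>

// Round `bytes` up to the next multiple of 4096 (valid because 4096 is a power of two).
constexpr std::size_t round_up_4k(std::size_t bytes) {
  return (bytes + 4096 - 1) & ~static_cast<std::size_t>(4096 - 1);
}

static_assert(round_up_4k(1) == 4096);
static_assert(round_up_4k(4096) == 4096);
static_assert(round_up_4k(4097) == 8192);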
+ size_t rounded_bytes = (bytes + 4096 - 1) & ~(4096 - 1); + + if (((uint8_t*)current_ptr_ + rounded_bytes) > ((uint8_t*)start_ + size_)) { + return nullptr; + } + + void* ptr = current_ptr_; + current_ptr_ = (uint8_t*)current_ptr_ + rounded_bytes; + return ptr; + } + + bool owns(void* ptr) { + return ptr >= start_ && ptr < (uint8_t*)start_ + size_; + } + + std::mutex mutex_; + void* start_; + size_t size_; + void* current_ptr_; + bool initialized_; +}; + // Struct containing memory allocator summary statistics for host. struct TORCH_API HostStats { - // COUNT: allocations requested by client code. Note that active - // count can be extracted by looking at current allocations - Stat allocation; - // COUNT: number of allocated segments from host memory allocation. - Stat segment; - - // SUM: bytes allocated by this memory alocator. Note that active bytes - // can be extracted by looking at current bytes allocated + // COUNT: total allocations (active) + Stat active_requests; + // SUM: bytes allocated/reserved by this memory alocator. (active) + Stat active_bytes; + // COUNT: total allocations (active + free) + Stat allocations; + // SUM: bytes allocated/reserved by this memory alocator. This accounts + // for both free and in-use blocks. Stat allocated_bytes; - // SUM: bytes reserved by this memory allocator (both free and used) - Stat reserved_bytes; // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds DurationStat host_alloc_time; @@ -75,6 +114,9 @@ struct TORCH_API HostStats { // COUNT: number of times cudaHostFree/cudaHostUnregister was called. int64_t num_host_free = 0; // This is derived from segment or timing + + // Count of cudaHostAlloc/cudaHostRegister per bucket + std::vector bucket_allocation = std::vector(MAX_SIZE_INDEX); }; // Struct containing memory allocator summary statistics for host, as they @@ -82,17 +124,22 @@ struct TORCH_API HostStats { // avoid locking the allocator while collecting stats. struct alignas(64) HostStatsStaged { std::mutex timing_mutex_; - // COUNT: allocations requested by client code resulting in a new segment/block allocation - // LOCK: access to this stat is protected by the allocator's blocks_mutex_ - Stat allocation; - // SUM: bytes within active memory blocks, including blocks that are - // currently in the free list. + // COUNT: total allocations (active + free) // LOCK: access to this stat is protected by the allocator's blocks_mutex_ + Stat allocations; + // SUM: bytes allocated/reserved by this memory alocator. This accounts + // for both free and in-use blocks. 
Stat allocated_bytes; - // COUNT: number of allocations per bucket + // COUNT: number of allocations per bucket (active) + // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ + std::vector active_bucket_stats = std::vector(MAX_SIZE_INDEX); + // SUM: bytes of allocation per bucket (active) + // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ + std::vector active_bytes_bucket_stats = std::vector(MAX_SIZE_INDEX); + // COUNT: number of allocations per bucket (active + free) // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ std::vector allocation_bucket_stats = std::vector(MAX_SIZE_INDEX); - // SUM: bytes of allocation per bucket + // SUM: bytes of allocation per bucket (active + free) // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ std::vector allocated_bytes_bucket_stats = std::vector(MAX_SIZE_INDEX); // SUM: time spent in cudaHostAlloc/cudaHostRegister @@ -211,12 +258,6 @@ struct CachingHostAllocatorImpl { // Check in the recently freed blocks with pending events to see if we // can reuse them. Call get_free_block again after processing events if (pinned_use_background_threads()) { - process_events_for_specific_size(roundSize); - block = get_free_block(roundSize); - if (block) { - return {block->ptr_, reinterpret_cast(block)}; - } - // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { getBackgroundThreadPool()->run([&]() { @@ -278,8 +319,6 @@ struct CachingHostAllocatorImpl { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); free_list_[index].list_.push_back(block); - stats_.allocation_bucket_stats[index].decrease(1); - stats_.allocated_bytes_bucket_stats[index].decrease(block->size_); } else { // restore these events that record by used streams. std::lock_guard g(events_mutex_); @@ -339,9 +378,12 @@ struct CachingHostAllocatorImpl { for (auto* block : blocks_to_remove) { blocks_.erase(block); ptr_to_block_.erase(block->ptr_); - stats_.allocation.decrease(1); - stats_.allocated_bytes.decrease(block->size_); + auto index = size_index(block->size_); free_block(block); + stats_.allocations.decrease(1); + stats_.allocated_bytes.decrease(block->size_); + stats_.allocation_bucket_stats[index].decrease(1); + stats_.allocated_bytes_bucket_stats[index].decrease(block->size_); delete block; } } @@ -388,16 +430,17 @@ struct CachingHostAllocatorImpl { // per bucket (we pick index 0 arbitrarily). These are also all the host // allocations, not taking into account caching and free lists. if (i == 0) { - stats.segment = stats_.allocation; - stats.reserved_bytes = stats_.allocated_bytes; - stats.num_host_alloc = stats.segment.allocated; - stats.num_host_free = stats.segment.freed; + stats.allocations = stats_.allocations; + stats.allocated_bytes = stats_.allocated_bytes; + stats.num_host_alloc = stats.allocations.allocated; + stats.num_host_free = stats.allocations.freed; } // Bucket stats need to be merged with the slow-path stats. We do this in // a best effort manner, since we can't really replay the cached events per bucket. 
- add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]); - add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]); + add_bucket_stats(stats.active_requests, stats_.active_bucket_stats[i]); + add_bucket_stats(stats.active_bytes, stats_.active_bytes_bucket_stats[i]); + stats.bucket_allocation[i] = stats_.allocation_bucket_stats[i].allocated; } // Get the timing stats @@ -421,9 +464,11 @@ struct CachingHostAllocatorImpl { std::lock_guard gb(blocks_mutex_, std::adopt_lock); if (i == 0) { - stats_.allocation.reset_accumulated(); + stats_.allocations.reset_accumulated(); stats_.allocated_bytes.reset_accumulated(); } + stats_.active_bucket_stats[i].reset_accumulated(); + stats_.active_bytes_bucket_stats[i].reset_accumulated(); stats_.allocation_bucket_stats[i].reset_accumulated(); stats_.allocated_bytes_bucket_stats[i].reset_accumulated(); } @@ -446,9 +491,11 @@ struct CachingHostAllocatorImpl { std::lock_guard gb(blocks_mutex_, std::adopt_lock); if (i == 0) { - stats_.allocation.reset_peak(); + stats_.allocations.reset_peak(); stats_.allocated_bytes.reset_peak(); } + stats_.active_bucket_stats[i].reset_peak(); + stats_.active_bytes_bucket_stats[i].reset_peak(); stats_.allocation_bucket_stats[i].reset_peak(); stats_.allocated_bytes_bucket_stats[i].reset_peak(); } @@ -465,7 +512,7 @@ struct CachingHostAllocatorImpl { virtual void add_allocated_block(B* block) { std::lock_guard g(blocks_mutex_); blocks_.insert(block); - stats_.allocation.increase(1); + stats_.allocations.increase(1); stats_.allocated_bytes.increase(block->size_); ptr_to_block_.insert({block->ptr_, block}); @@ -478,6 +525,8 @@ struct CachingHostAllocatorImpl { std::lock_guard g(free_list_[index].mutex_); stats_.allocation_bucket_stats[index].increase(1); stats_.allocated_bytes_bucket_stats[index].increase(size); + stats_.active_bucket_stats[index].increase(1); + stats_.active_bytes_bucket_stats[index].increase(size); } } @@ -488,8 +537,8 @@ struct CachingHostAllocatorImpl { B* block = free_list_[index].list_.back(); free_list_[index].list_.pop_back(); block->allocated_ = true; - stats_.allocation_bucket_stats[index].increase(1); - stats_.allocated_bytes_bucket_stats[index].increase(size); + stats_.active_bucket_stats[index].increase(1); + stats_.active_bytes_bucket_stats[index].increase(size); return block; } return nullptr; @@ -583,8 +632,8 @@ struct CachingHostAllocatorImpl { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); free_list_[index].list_.push_back(block); - stats_.allocation_bucket_stats[index].decrease(1); - stats_.allocated_bytes_bucket_stats[index].decrease(size); + stats_.active_bucket_stats[index].decrease(1); + stats_.active_bytes_bucket_stats[index].decrease(size); if (size != -1) { return; } diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index eaca01fe5e09..0bbeb9ddc13a 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) { } void check_names_valid_for(const TensorBase& tensor, DimnameList names) { - return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names); + impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names); } void check_names_valid_for(size_t tensor_dim, DimnameList names) { diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h index 81998e160185..52acae90b128 100644 --- a/aten/src/ATen/core/NamedTensor.h +++ 
b/aten/src/ATen/core/NamedTensor.h @@ -27,11 +27,11 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { HasNonWildcard }; - explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names) + explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names) : names_(names.vec()) { check_invariants(); } - explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector&& names) + explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector&& names) : names_(std::move(names)) { check_invariants(); } @@ -52,13 +52,13 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); })); } - void set_names(HAS_NON_WILDCARD, DimnameList new_names) { + void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) { TORCH_INTERNAL_ASSERT(new_names.size() == names_.size()); std::copy(new_names.begin(), new_names.end(), names_.begin()); check_invariants(); } - void set_names(HAS_NON_WILDCARD, std::vector&& new_names) { + void set_names(HAS_NON_WILDCARD /*unused*/, std::vector&& new_names) { TORCH_INTERNAL_ASSERT(new_names.size() == names_.size()); names_ = std::move(new_names); check_invariants(); diff --git a/aten/src/ATen/core/PhiloxRNGEngine.h b/aten/src/ATen/core/PhiloxRNGEngine.h index 413055d3fad6..e8bac545933c 100644 --- a/aten/src/ATen/core/PhiloxRNGEngine.h +++ b/aten/src/ATen/core/PhiloxRNGEngine.h @@ -229,10 +229,10 @@ class philox_engine { } - static const uint32_t kPhilox10A = 0x9E3779B9; - static const uint32_t kPhilox10B = 0xBB67AE85; - static const uint32_t kPhiloxSA = 0xD2511F53; - static const uint32_t kPhiloxSB = 0xCD9E8D57; + static constexpr uint32_t kPhilox10A = 0x9E3779B9; + static constexpr uint32_t kPhilox10B = 0xBB67AE85; + static constexpr uint32_t kPhiloxSA = 0xD2511F53; + static constexpr uint32_t kPhiloxSB = 0xCD9E8D57; }; typedef philox_engine Philox4_32; diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index efd9508ce15c..39f4e7cb6976 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace { @@ -53,20 +54,24 @@ void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_ TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); // c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value()); // StashTLSOnEntryGuard stash_guard; - c10::impl::ExcludeDispatchKeyGuard guard(after_Python_keyset); + c10::impl::ExcludeDispatchKeyGuard exclude_guard(after_Python_keyset); + const auto& schema = op.schema(); + const auto num_arguments = schema.arguments().size(); // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); if (mode_stack_len > 0) { + RECORD_FUNCTION("PythonDispatchMode", torch::jit::last(*stack, num_arguments)); const auto& cur_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); cur_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack); return; } + RECORD_FUNCTION("PythonSubclass", torch::jit::last(*stack, num_arguments)); + // Otherwise, find a PyInterpreter on a Tensor - const auto& schema = op.schema(); - const auto num_arguments = schema.arguments().size(); + // It is safe to dispatch on the very first Tensor with a pyobj_interpreter // without checking the interpreters of any of the arguments, 
because when // we actually run dispatch(), we will take out PyObjects in the context diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h index bec323c7d25b..83b39de34d78 100644 --- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h +++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h @@ -13,7 +13,7 @@ class TORCH_API PythonOpRegistrationTrampoline final { public: // Returns true if you successfully registered yourself (that means // you are in the hot seat for doing the operator registrations!) - static bool registerInterpreter(c10::impl::PyInterpreter*); + static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/); // Returns nullptr if no interpreter has been registered yet. static c10::impl::PyInterpreter* getInterpreter(); diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 246418ad7ce8..c5f887f096cd 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs, const std::optional& gradient, std::optional keep_graph, bool create_graph) const { - return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); + impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); } const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const { @@ -173,4 +173,12 @@ unsigned TensorBase::_register_hook(std::function return impl::GetVariableHooks()->_register_hook(*this, std::move(hook)); } +std::optional TensorBase::grad_dtype() const { + return impl::GetVariableHooks()->grad_dtype(*this); +} + +void TensorBase::set_grad_dtype(const std::optional& grad_dtype) const { + return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype); +} + } // namespace at diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 5f43738ea0fa..1d0a3e73a5a5 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -100,7 +100,7 @@ class TORCH_API TensorBase { // Create a Tensor with a +0 reference count. Special care must be // taken to avoid decrementing this reference count at destruction // time. Intended to support MaybeOwnedTraits. 
- explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs) + explicit TensorBase(unsafe_borrow_t /*unused*/, const TensorBase& rhs) : impl_(c10::intrusive_ptr(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {} friend MaybeOwnedTraits; @@ -930,6 +930,10 @@ class TORCH_API TensorBase { const TensorBase& requires_grad_(bool _requires_grad=true) const; + std::optional grad_dtype() const; + + void set_grad_dtype(const std::optional& grad_dtype) const; + // View Variables //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -950,7 +954,7 @@ class TORCH_API TensorBase { c10::intrusive_ptr impl_; private: - TensorBase __dispatch_contiguous(c10::MemoryFormat) const; + TensorBase __dispatch_contiguous(c10::MemoryFormat /*memory_format*/) const; }; inline DeviceIndex get_device(const TensorBase& self) { diff --git a/aten/src/ATen/core/TransformationHelper.h b/aten/src/ATen/core/TransformationHelper.h index f81018a8e674..dad18bd019bb 100644 --- a/aten/src/ATen/core/TransformationHelper.h +++ b/aten/src/ATen/core/TransformationHelper.h @@ -117,7 +117,7 @@ C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) { template <> C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) { // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function - return median + sigma * at::tan(c10::pi * (val - static_cast(0.5))); + return median + sigma * at::tan(c10::pi * (val - 0.5)); } /** diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index f9c0aa4a5fc1..c0f270700e3c 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -68,6 +68,8 @@ struct TORCH_API VariableHooksInterface { const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) const = 0; + virtual std::optional grad_dtype(const TensorBase&) const = 0; + virtual void set_grad_dtype(const TensorBase&, const std::optional&) const = 0; }; TORCH_API void SetVariableHooks(VariableHooksInterface* hooks); diff --git a/aten/src/ATen/core/boxing/BoxedKernel.h b/aten/src/ATen/core/boxing/BoxedKernel.h index 62b915885a80..c5e46d8de000 100644 --- a/aten/src/ATen/core/boxing/BoxedKernel.h +++ b/aten/src/ATen/core/boxing/BoxedKernel.h @@ -18,10 +18,10 @@ class KernelFunction; // implementation notes; notably, this does NOT actually go through the // boxing/unboxing codepath. TORCH_API void fallthrough_kernel( - OperatorKernel*, - const OperatorHandle&, - DispatchKeySet, - Stack*); + OperatorKernel* /*unused*/, + const OperatorHandle& /*unused*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); // Note [Ambiguity in AutogradOther kernel] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -62,10 +62,10 @@ TORCH_API void fallthrough_kernel( // than arbitrarily pick one or the other, we just register a kernel that raises // an error and let the user decide how to proceed. TORCH_API void ambiguous_autogradother_kernel( - OperatorKernel*, - const OperatorHandle&, - DispatchKeySet, - Stack*); + OperatorKernel* /*unused*/, + const OperatorHandle& /*op*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); // Note [named_not_supported_kernel] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -75,10 +75,10 @@ TORCH_API void ambiguous_autogradother_kernel( // give a good error message in cases when boxing is not supported). When // boxing is universally supported this can be removed. 
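Stepping back to the grad_dtype()/set_grad_dtype() accessors introduced on TensorBase above: they route through VariableHooksInterface, and only the call shape is fixed by these headers. A hypothetical usage sketch, assuming the optional payload is at::ScalarType; the actual behavior is defined by the autograd side of this patch, not shown here:

#include <optional>

#include <ATen/ATen.h>

// Call-shape sketch only: the dtype actually used for `.grad()` is decided by the
// autograd hooks this patch wires up elsewhere.
void grad_dtype_sketch(at::Tensor& param) {
  param.set_grad_dtype(at::kBFloat16);                    // request a non-default grad dtype
  std::optional<at::ScalarType> gd = param.grad_dtype();  // nullopt would mean "unset"
  (void)gd;
}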
[[noreturn]] TORCH_API void named_not_supported_kernel( - OperatorKernel*, - const OperatorHandle&, - DispatchKeySet, - Stack*); + OperatorKernel* /*unused*/, + const OperatorHandle& /*op*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); /** * BoxedKernel is similar to a std::function storing a boxed kernel. @@ -185,16 +185,16 @@ class TORCH_API BoxedKernel final { template static void make_boxed_function( - OperatorKernel*, + OperatorKernel* /*unused*/, const OperatorHandle& opHandle, - DispatchKeySet, + DispatchKeySet /*unused*/, Stack* stack); template static void make_boxed_function( - OperatorKernel*, + OperatorKernel* /*unused*/, const OperatorHandle& opHandle, - DispatchKeySet, + DispatchKeySet /*ks*/, Stack* stack); explicit BoxedKernel( diff --git a/aten/src/ATen/core/boxing/BoxedKernel_impl.h b/aten/src/ATen/core/boxing/BoxedKernel_impl.h index 1960607c6bc8..04ba1368f070 100644 --- a/aten/src/ATen/core/boxing/BoxedKernel_impl.h +++ b/aten/src/ATen/core/boxing/BoxedKernel_impl.h @@ -2,7 +2,7 @@ namespace c10 { -inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {} +inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {} inline BoxedKernel::BoxedKernel( std::unique_ptr functor, @@ -11,9 +11,9 @@ inline BoxedKernel::BoxedKernel( template inline void BoxedKernel::make_boxed_function( - OperatorKernel*, + OperatorKernel* /*unused*/, const OperatorHandle& opHandle, - DispatchKeySet, + DispatchKeySet /*unused*/, Stack* stack) { // Note that we're dropping the DispatchKeySet argument. // See Note [Plumbing Keys Through The Dispatcher 2] for details. @@ -22,7 +22,7 @@ inline void BoxedKernel::make_boxed_function( template inline void BoxedKernel::make_boxed_function( - OperatorKernel*, + OperatorKernel* /*unused*/, const OperatorHandle& opHandle, DispatchKeySet ks, Stack* stack) { diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index c099c456814a..dd2fb32e6817 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -10,7 +10,7 @@ namespace c10 { // be handled specially. Its semantics is that it redispatches to the // *next* dispatch key that would have been processed, skipping the current // one. -void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*) { +void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/) { TORCH_INTERNAL_ASSERT(0, "fallthrough_kernel was executed but it should have been short-circuited by the dispatcher. " "This could occur if you registered a fallthrough kernel as a override for a specific operator " @@ -19,7 +19,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, "let us know in the bug tracker."); } -void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) { +void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) { TORCH_INTERNAL_ASSERT(0, op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. 
" "This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering " @@ -32,7 +32,7 @@ void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, D "\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n"); } -void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) { +void named_not_supported_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) { // DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING // See Note [named_not_supported_kernel] TORCH_CHECK(0, diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 4300217235b8..eb0cf833dfc2 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -229,7 +229,7 @@ class TORCH_API KernelFunction final { * &unboxed_func>(); */ template - static KernelFunction makeFromUnboxedFunction(FuncPtr); + static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/); /** * Create a KernelFunction from an unboxed function. @@ -271,7 +271,7 @@ class TORCH_API KernelFunction final { std::string dumpState() const; // For testing internal invariants only - bool _equalsBoxedAndUnboxed(const KernelFunction&) const; + bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const; // Register a token to be invalidated when this KernelFunction is destroyed void registerToken(std::weak_ptr token) const; diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 672309ec19a2..bb981c1d4efd 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -20,9 +20,7 @@ make_unique_base(Args&&... args) { } // namespace detail inline KernelFunction::KernelFunction() - : boxed_kernel_func_(), - unboxed_kernel_func_(nullptr), - sym_unboxed_kernel_func_(nullptr) {} + : unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {} inline KernelFunction::~KernelFunction() { if (tokens_) { diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index 68e25cccd44c..7fbc3b982609 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -131,7 +131,7 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack( new (dest++) IValue(options.pinned_memory()); } -inline void boxArgsToStack(IValue*&) {} +inline void boxArgsToStack(IValue*& /*unused*/) {} template C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack( @@ -185,7 +185,7 @@ struct PopResult> final { template static Result pop_to_tuple_impl( Stack& stack, - std::index_sequence) { + std::index_sequence /*unused*/) { return std::make_tuple((std::move(stack[indices]).template to())...); } }; diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 20dfde846e64..34b1514f32cd 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -561,7 +561,7 @@ struct wrap_kernel_functor_unboxed_< // doesn't use && static ReturnType call( OperatorKernel* functor, - DispatchKeySet, + DispatchKeySet /*unused*/, ParameterTypes... 
args) { KernelFunctor* functor_ = static_cast(functor); // Note [Plumbing Keys Through The Dispatcher 2] @@ -629,8 +629,8 @@ call_functor_with_args_from_stack_( OperatorKernel* functor, DispatchKeySet dispatchKeySet, Stack* stack, - std::index_sequence, - guts::typelist::typelist*) { + std::index_sequence /*unused*/, + guts::typelist::typelist* /*unused*/) { (void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would // be unused and we have to silence the compiler warning. @@ -708,7 +708,7 @@ struct push_outputs, AllowDeprecatedTypes> final { static void call_( std::tuple&& output, Stack* stack, - std::index_sequence) { + std::index_sequence /*unused*/) { torch::jit::push( *stack, return_to_ivalue::call( @@ -718,7 +718,7 @@ struct push_outputs, AllowDeprecatedTypes> final { static void copy_( const std::tuple& output, Stack* stack, - std::index_sequence) { + std::index_sequence /*unused*/) { torch::jit::push( *stack, return_to_ivalue::copy( @@ -741,7 +741,7 @@ struct make_boxed_from_unboxed_functor final { static void call( OperatorKernel* functor, - const OperatorHandle&, + const OperatorHandle& /*unused*/, DispatchKeySet dispatchKeySet, Stack* stack) { using ReturnType = diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 5ab1ace1685f..8c837871dff7 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -63,13 +63,13 @@ struct BuiltinOpFunction : public Function { bool call( Stack& stack, - std::optional, - c10::function_ref) override { + std::optional /*unused*/, + c10::function_ref /*unused*/) override { run(stack); return false; } - bool call(Stack& stack, c10::function_ref) + bool call(Stack& stack, c10::function_ref /*unused*/) override { run(stack); return false; diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index ecc4bc7b5d89..dbd00e9c5290 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -80,7 +80,8 @@ struct MultiDispatchKeySet : at::IterArgs { ts = ts | x.key_set(); } } - [[noreturn]] void operator()(at::ArrayRef>) { + [[noreturn]] void operator()( + at::ArrayRef> /*unused*/) { // Just checking that the handling of Tensor?[] didn't change. 
TORCH_INTERNAL_ASSERT(false); } @@ -95,7 +96,7 @@ struct MultiDispatchKeySet : at::IterArgs { } } template - void operator()(const T&) { + void operator()(const T& /*unused*/) { // do nothing } }; diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 91a5f6459617..4f9d7c6ec0db 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -76,13 +76,7 @@ void _print_dispatch_trace(const std::string& label, const std::string& op_name, OpRegistrationListener::~OpRegistrationListener()= default; -Dispatcher::Dispatcher() -: operators_() -, operatorLookupTable_() -, backendFallbackKernels_() -, listeners_(std::make_unique()) -, cond_var_() -, guard_(std::make_shared()) +Dispatcher::Dispatcher(): backendFallbackKernels_(), listeners_(std::make_unique()), guard_(std::make_shared()) {} Dispatcher::~Dispatcher() { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 43eb0028c70f..29139a294745 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -96,7 +96,7 @@ class TORCH_API Dispatcher final { friend class TypedOperatorHandle; struct Guard final { - Guard() : alive(true), mutex() {} + Guard() : alive(true) {} std::atomic alive; std::mutex mutex; }; @@ -496,7 +496,7 @@ class TORCH_API OperatorHandle { } void checkInvariants() const { - return operatorDef_->op.checkInvariants(); + operatorDef_->op.checkInvariants(); } c10::ArrayRef getTags() const { @@ -633,7 +633,7 @@ class TypedOperatorHandle final : public OperatorHandle { namespace detail { template -inline void unused_arg_(const Args&...) {} +inline void unused_arg_(const Args&... /*unused*/) {} // CaptureKernelCall is intended to capture return values from Dispatcher // unboxed kernel calls. A record function may request to get outputs from the @@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed( } #endif const auto& kernel = entry.lookup(dispatchKeySet); - return kernel.callBoxed(op, dispatchKeySet, stack); + kernel.callBoxed(op, dispatchKeySet, stack); } } // namespace c10 diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index c172e9b9c609..7040049ddf1e 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -62,17 +62,7 @@ static const auto& getDispatchTableIndexToKey() { } OperatorEntry::OperatorEntry(OperatorName&& operator_name) -: name_(std::move(operator_name)) -, schema_() -#ifndef C10_MOBILE -, tags_() -#endif -, dispatchTable_() -, dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) -, kernels_() -, cpp_signature_() -, sym_cpp_signature_() -, is_observed_(ObservedOperators::isObserved(name_)) +: name_(std::move(operator_name)), dispatchTable_(), dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()), is_observed_(ObservedOperators::isObserved(name_)) { // Pick up any backend fallbacks that were registered prior to this // OperatorEntry being created. diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 59b54ce1d9d3..cc5736ba0e77 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -105,7 +105,7 @@ class TORCH_API OperatorEntry final { // versa that is an error. 
(Refcounting for the registrations is // handled in the OperatorHandle in Dispatcher) void registerSchema( - FunctionSchema&&, + FunctionSchema&& /*schema*/, std::string&& debug, std::vector tags = {}); void deregisterSchema(); diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp index d4596ed2ca73..2b1a32bd0ac8 100644 --- a/aten/src/ATen/core/dynamic_type.cpp +++ b/aten/src/ATen/core/dynamic_type.cpp @@ -177,7 +177,7 @@ bool DynamicType::equals(const Type& rhs) const { return equals(*create(rhs)); } -bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream*) const { +bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream* /*why_not*/) const { auto other = create(rhs); if (tag_ == other->tag_) { if (equals(*other)) { @@ -371,7 +371,7 @@ DynamicTypePtr ivalue::TupleTypeFactory::create( } DynamicTypePtr ivalue::TupleTypeFactory::fallback( - const Type&) { + const Type& /*unused*/) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return nullptr; } diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index 2ba841e44e20..ee0d077e5c51 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -138,8 +138,8 @@ class DynamicType : public SharedType { struct Arguments { Arguments() = default; - Arguments(c10::ArrayRef); - Arguments(const std::vector&, c10::ArrayRef); + Arguments(c10::ArrayRef /*args*/); + Arguments(const std::vector& /*names*/, c10::ArrayRef /*args*/); std::vector elems; }; @@ -156,15 +156,15 @@ class DynamicType : public SharedType { static const TypeKind Kind = TypeKind::DynamicType; static TORCH_API DynamicTypePtr create(Type& ty); - explicit DynamicType(Tag, Arguments); - explicit DynamicType(Tag, std::string_view, Arguments); + explicit DynamicType(Tag /*tag*/, Arguments /*arguments*/); + explicit DynamicType(Tag /*tag*/, std::string_view /*name*/, Arguments /*arguments*/); DynamicType(DynamicType&& other) = delete; DynamicType(const DynamicType&) = delete; DynamicType& operator=(const DynamicType&) = delete; DynamicType& operator=(DynamicType&&) = delete; - TypePtr containedType(size_t) const override; + TypePtr containedType(size_t /*i*/) const override; size_t containedTypeSize() const override; Tag tag() const { return tag_; diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index 7e8a765a05ab..83db2ec9d71d 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -96,15 +96,15 @@ struct TORCH_API Function { // Overload for server interpreter, a bailout size is needed for graph // executor. virtual bool call( - Stack&, - std::optional, - c10::function_ref) { + Stack& /*unused*/, + std::optional /*unused*/, + c10::function_ref /*unused*/) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } // Overload for mobile interpreter. - virtual bool call(Stack&, c10::function_ref) { + virtual bool call(Stack& /*unused*/, c10::function_ref /*unused*/) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 72589436606e..264c7aff2cca 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -357,7 +357,7 @@ IValue IValue::equals(const IValue& rhs) const { case Tag::Enum: return lhs.toEnumHolder()->is(*rhs.toEnumHolder()); case Tag::Uninitialized: - // Unitialized ivalues show up in no-ops when the compiler can prove a + // Uninitialized ivalues show up in no-ops when the compiler can prove a // value will never be used. 
Just return false on any equality comparison. return false; } diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index ab2039e05820..d9516ed900e3 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -624,7 +624,14 @@ struct TORCH_API IValue final { IValue(const c10::SymBool& i) { if (auto mi = i.maybe_as_bool()) { tag = Tag::Bool; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ payload.u.as_int = *mi; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* due to byteorder if value assigned as_int, as_bool actually is not set correctly */ + payload.u.as_bool = *mi; +#else +#error Unexpected or undefined __BYTE_ORDER__ +#endif } else { tag = Tag::SymBool; payload.u.as_intrusive_ptr = i.toSymNodeImpl().release(); @@ -847,7 +854,7 @@ struct TORCH_API IValue final { IValue(std::optional v); template = nullptr> IValue(c10::OptionalArrayRef v); - IValue(std::nullopt_t); + IValue(std::nullopt_t /*unused*/); // ClassType IValue(c10::intrusive_ptr v); diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 1251c4c0c210..89759560c3ea 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -660,7 +660,7 @@ struct TORCH_API TupleTypeFactory { template <> struct TORCH_API TupleTypeFactory { static DynamicTypePtr create(const std::vector& elemTypes); - static DynamicTypePtr fallback(const Type&); + static DynamicTypePtr fallback(const Type& /*unused*/); }; struct TORCH_API Tuple : c10::intrusive_ptr_target { @@ -1682,7 +1682,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target { namespace detail { struct _guarded_unsigned_long_unique_dummy final { - _guarded_unsigned_long_unique_dummy(int64_t){} + _guarded_unsigned_long_unique_dummy(int64_t /*unused*/){} }; using _guarded_unsigned_long = std::conditional_t< std::is_same_v || @@ -1776,7 +1776,7 @@ template // native_functions.yaml still return std::vector. // C10_DEPRECATED_MESSAGE("IValues based on std::vector are potentially slow // and deprecated. Please use torch::List instead.") -std::vector generic_to(IValue ivalue, _fake_type>) { +std::vector generic_to(IValue ivalue, _fake_type> /*unused*/) { // We need to do a deep copy of the vector because there might be other // references to this same IValue that also use the list. We can't just // move the elements out. 
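Annotation, not part of the patch: the __BYTE_ORDER__ guard added to IValue's SymBool constructor above exists because payload.u is a union whose as_bool member aliases the lowest-addressed byte, while as_int spans the full word; on big-endian targets the set byte of an integer 1 is at the opposite end of the storage. A minimal, self-contained sketch of the pitfall (illustrative only, not PyTorch code):

#include <cstdint>
#include <cstring>
#include <iostream>

union Payload {
  int64_t as_int;
  bool as_bool;
};

int main() {
  Payload p{};
  p.as_int = 1; // sets the least significant byte of the 8-byte integer
  unsigned char first_byte = 0;
  std::memcpy(&first_byte, &p, 1); // the byte a bool member would occupy
  // Prints 1 on little-endian but 0 on big-endian, which is why the patch writes
  // as_bool directly when __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__.
  std::cout << static_cast<int>(first_byte) << "\n";
  return 0;
}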
@@ -1826,18 +1826,18 @@ c10::intrusive_ptr IValue::toCustomClass() const& { } template -T generic_to(IValue ivalue, _fake_type) { +T generic_to(IValue ivalue, _fake_type /*unused*/) { using ElemType = typename std::remove_pointer::type::element_type; return std::move(ivalue).template toCustomClass(); } template -tagged_capsule generic_to(IValue ivalue, _fake_type>) { +tagged_capsule generic_to(IValue ivalue, _fake_type> /*unused*/) { return tagged_capsule{std::move(ivalue)}; } template -c10::List generic_to(IValue ivalue, _fake_type>) { +c10::List generic_to(IValue ivalue, _fake_type> /*unused*/) { return impl::toTypedList(std::move(ivalue).toList()); } @@ -1867,7 +1867,7 @@ std::vector createVectorFromList(const c10::List& impl) { } template -OptionalArray generic_to(IValue ivalue, _fake_type>) { +OptionalArray generic_to(IValue ivalue, _fake_type> /*unused*/) { if (ivalue.isNone()) { return {}; } @@ -1880,8 +1880,8 @@ namespace detail { template std::array generic_to_array( IValue ivalue, - _fake_type>, - std::index_sequence) { + _fake_type> /*unused*/, + std::index_sequence /*unused*/) { // We need to do a deep copy of the array because there might be other // references to this same IValue that also use the list. We can't just // move the elements out. @@ -1906,7 +1906,7 @@ std::array generic_to( template c10::Dict generic_to( IValue ivalue, - _fake_type>) { + _fake_type> /*unused*/) { return impl::toTypedDict(std::move(ivalue).toGenericDict()); } @@ -1915,7 +1915,7 @@ C10_DEPRECATED_MESSAGE( "IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict instead.") std::unordered_map generic_to( IValue ivalue, - _fake_type>) { + _fake_type> /*unused*/) { std::unordered_map specialized_dict; for (const auto& item : std::move(ivalue).toGenericDict()) { @@ -1926,7 +1926,7 @@ std::unordered_map generic_to( } template -std::optional generic_to(IValue ivalue, _fake_type>) { +std::optional generic_to(IValue ivalue, _fake_type> /*unused*/) { if (ivalue.isNone()) { return std::nullopt; } @@ -1937,7 +1937,7 @@ namespace detail { template Tuple generic_to_tuple_impl( const ivalue::TupleElements& t, - std::index_sequence) { + std::index_sequence /*unused*/) { return std::make_tuple( t[INDEX].to::type>()...); } @@ -1951,7 +1951,7 @@ template < std::is_lvalue_reference..., std::negation>...>, std::nullptr_t> = nullptr> -std::tuple generic_to(const IValue& ivalue, _fake_type>) { +std::tuple generic_to(const IValue& ivalue, _fake_type> /*unused*/) { const auto& vals = ivalue.toTupleRef().elements(); TORCH_CHECK(vals.size() == sizeof...(Args)); return detail::generic_to_tuple_impl>(vals, Indices{}); @@ -2311,7 +2311,7 @@ inline IValue::IValue(std::optional v) : IValue() { } } -inline IValue::IValue(std::nullopt_t) : IValue() {} +inline IValue::IValue(std::nullopt_t /*unused*/) : IValue() {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Object) { @@ -2482,15 +2482,15 @@ namespace ivalue { namespace detail { template -IValue from_(T&& x, std::true_type) { +IValue from_(T&& x, std::true_type /*unused*/) { return IValue(std::forward(x)); } template -IValue from_(c10::intrusive_ptr x, std::false_type) { +IValue from_(c10::intrusive_ptr x, std::false_type /*unused*/) { return IValue(std::move(x)); } template -IValue from_(T&& /*x*/, std::false_type) { +IValue from_(T&& /*x*/, std::false_type /*unused*/) { static_assert( guts::false_t::value, "You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)"); @@ -2546,19 
+2546,19 @@ struct MaybeOwnedTraits { return &borrow; } - static bool debugBorrowIsValid(const borrow_type&) { + static bool debugBorrowIsValid(const borrow_type& /*unused*/) { return true; } }; template <> struct IValue::TagType { - static TORCH_API c10::TypePtr get(const IValue&); + static TORCH_API c10::TypePtr get(const IValue& /*v*/); }; template <> struct IValue::TagType { - static TORCH_API c10::TypePtr get(const IValue&); + static TORCH_API c10::TypePtr get(const IValue& /*v*/); }; template diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index c15e5f72af27..d8e7b7e8b55a 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1234,7 +1234,7 @@ struct TORCH_API TupleType : public NamedType { std::shared_ptr schema_; }; -// the common supertype of all Enums, only used in operator registraion. +// the common supertype of all Enums, only used in operator registration. // EnumType <: AnyEnumType for all Enums struct AnyEnumType; using AnyEnumTypePtr = SingletonTypePtr; diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index a393e0290458..0ee79ed85930 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -44,7 +44,7 @@ constexpr int checkStaticTypes() { } template -constexpr std::array createArgumentVectorFromTypes(std::index_sequence) { +constexpr std::array createArgumentVectorFromTypes(std::index_sequence /*unused*/) { return ( // Check types for common errors checkStaticTypes(), diff --git a/aten/src/ATen/core/op_registration/op_allowlist.h b/aten/src/ATen/core/op_registration/op_allowlist.h index 3e8e03f9fa4c..1f39ba4e3871 100644 --- a/aten/src/ATen/core/op_registration/op_allowlist.h +++ b/aten/src/ATen/core/op_registration/op_allowlist.h @@ -114,7 +114,7 @@ constexpr bool allowlist_contains(std::string_view allowlist, std::string_view i } next++; } else { - if (allowlist.substr(cur).compare(item) == 0) { + if (allowlist.substr(cur) == item) { return true; } break; diff --git a/aten/src/ATen/core/op_registration/op_registration.cpp b/aten/src/ATen/core/op_registration/op_registration.cpp index b5ae2290b5ad..b34134309cb7 100644 --- a/aten/src/ATen/core/op_registration/op_registration.cpp +++ b/aten/src/ATen/core/op_registration/op_registration.cpp @@ -73,7 +73,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_( std::optional inferred_schema = std::nullopt; for (const auto& kernel : options.kernels) { - if (nullptr != kernel.inferred_function_schema.get()) { + if (nullptr != kernel.inferred_function_schema) { if (!inferred_schema.has_value()) { inferred_schema = *kernel.inferred_function_schema; break; diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index 7a44cfa49b07..d441269bf297 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -411,7 +411,6 @@ class TORCH_API RegisterOperators final { Options() : schemaOrName_(std::nullopt) - , kernels() , aliasAnalysisKind_(std::nullopt) {} @@ -420,7 +419,6 @@ class TORCH_API RegisterOperators final { struct KernelRegistrationConfig final { KernelRegistrationConfig() : dispatch_key(std::nullopt) - , func() , cpp_signature(std::nullopt) , inferred_function_schema(nullptr) {} diff --git a/aten/src/ATen/core/operator_name.h b/aten/src/ATen/core/operator_name.h index 22e1f427b632..4c138ee50456 100644 
--- a/aten/src/ATen/core/operator_name.h +++ b/aten/src/ATen/core/operator_name.h @@ -83,7 +83,7 @@ inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) { } TORCH_API std::string toString(const OperatorName& opName); -TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&); +TORCH_API std::ostream& operator<<(std::ostream& /*os*/, const OperatorName& /*opName*/); } // namespace c10 diff --git a/aten/src/ATen/core/type_ptr.h b/aten/src/ATen/core/type_ptr.h index 0859e04c7d2d..011a1750ecaa 100644 --- a/aten/src/ATen/core/type_ptr.h +++ b/aten/src/ATen/core/type_ptr.h @@ -16,7 +16,7 @@ class SingletonTypePtr { /* implicit */ SingletonTypePtr(T* p) : repr_(p) {} // We need this to satisfy Pybind11, but it shouldn't be hit. - explicit SingletonTypePtr(std::shared_ptr) { TORCH_CHECK(false); } + explicit SingletonTypePtr(std::shared_ptr /*unused*/) { TORCH_CHECK(false); } using element_type = typename std::shared_ptr::element_type; diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h index d269e1073959..9e0b189bdac8 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -308,8 +308,8 @@ Vectorized inline operator/( } inline Vectorized::Vectorized() { - const short zero = 0; - values = svdup_n_bf16(c10::bit_cast(zero)); + auto vals_f = svdup_n_f32(0); + values = convert_float_bfloat16(vals_f, vals_f); } inline Vectorized::Vectorized(int val) { diff --git a/aten/src/ATen/cpu/vec/vec128/vec128.h b/aten/src/ATen/cpu/vec/vec128/vec128.h index c49580410aaf..6b216f20b0bd 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128.h @@ -8,6 +8,7 @@ #include #include #include +#include #endif #include diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_int_aarch64.h b/aten/src/ATen/cpu/vec/vec128/vec128_int_aarch64.h new file mode 100644 index 000000000000..070ba25f8574 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec128/vec128_int_aarch64.h @@ -0,0 +1,794 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. +inline namespace CPU_CAPABILITY { + +#define VEC_INT_NEON_TEMPLATE(vl, bit) \ + template <> \ + struct is_vec_specialized_for : std::bool_constant {}; \ + \ + template <> \ + class Vectorized { \ + using neon_type = int##bit##x##vl##_t; \ + \ + private: \ + neon_type values; \ + \ + public: \ + using value_type = int##bit##_t; \ + using size_type = int; \ + static constexpr size_type size() { \ + return vl; \ + } \ + Vectorized() { \ + values = vdupq_n_s##bit(0); \ + } \ + Vectorized(neon_type v) : values(v) {} \ + Vectorized(int##bit##_t val); \ + template < \ + typename... Args, \ + typename = std::enable_if_t<(sizeof...(Args) == size())>> \ + Vectorized(Args... 
vals) { \ + __at_align__ int##bit##_t buffer[size()] = {vals...}; \ + values = vld1q_s##bit(buffer); \ + } \ + operator neon_type() const { \ + return values; \ + } \ + static Vectorized loadu( \ + const void* ptr, \ + int64_t count = size()); \ + void store(void* ptr, int64_t count = size()) const; \ + template \ + static Vectorized blend( \ + const Vectorized& a, \ + const Vectorized& b); \ + static Vectorized blendv( \ + const Vectorized& a, \ + const Vectorized& b, \ + const Vectorized& mask_) { \ + return vbslq_s##bit(vreinterpretq_u##bit##_s##bit(mask_.values), b, a); \ + } \ + template \ + static Vectorized arange( \ + value_type base = 0, \ + step_t step = static_cast(1)); \ + static Vectorized set( \ + const Vectorized& a, \ + const Vectorized& b, \ + int64_t count = size()); \ + const int##bit##_t& operator[](int idx) const = delete; \ + int##bit##_t& operator[](int idx) = delete; \ + Vectorized abs() const { \ + return vabsq_s##bit(values); \ + } \ + Vectorized real() const { \ + return values; \ + } \ + Vectorized imag() const { \ + return vdupq_n_s##bit(0); \ + } \ + Vectorized conj() const { \ + return values; \ + } \ + Vectorized neg() const { \ + return vnegq_s##bit(values); \ + } \ + int##bit##_t reduce_add() const { \ + return vaddvq_s##bit(values); \ + } \ + int##bit##_t reduce_max() const; \ + Vectorized operator==( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vceqq_s##bit(values, other.values))); \ + } \ + Vectorized operator!=( \ + const Vectorized& other) const; \ + Vectorized operator<( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcltq_s##bit(values, other.values))); \ + } \ + Vectorized operator<=( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcleq_s##bit(values, other.values))); \ + } \ + Vectorized operator>( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcgtq_s##bit(values, other.values))); \ + } \ + Vectorized operator>=( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcgeq_s##bit(values, other.values))); \ + } \ + Vectorized eq(const Vectorized& other) const; \ + Vectorized ne(const Vectorized& other) const; \ + Vectorized gt(const Vectorized& other) const; \ + Vectorized ge(const Vectorized& other) const; \ + Vectorized lt(const Vectorized& other) const; \ + Vectorized le(const Vectorized& other) const; \ + }; \ + template <> \ + Vectorized inline operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return vaddq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return vsubq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return vandq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return vorrq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return veorq_s##bit(a, b); \ + } \ + Vectorized inline Vectorized::eq( \ + const Vectorized& other) const { \ + return (*this == other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ne( \ + const Vectorized& other) const { \ + return (*this != other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::gt( \ + const Vectorized& other) const { \ + return (*this > other) & 
Vectorized(1); \ + } \ + Vectorized inline Vectorized::ge( \ + const Vectorized& other) const { \ + return (*this >= other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::lt( \ + const Vectorized& other) const { \ + return (*this < other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::le( \ + const Vectorized& other) const { \ + return (*this <= other) & Vectorized(1); \ + } + +VEC_INT_NEON_TEMPLATE(2, 64) +VEC_INT_NEON_TEMPLATE(4, 32) +VEC_INT_NEON_TEMPLATE(8, 16) +VEC_INT_NEON_TEMPLATE(16, 8) + +inline int32_t Vectorized::reduce_max() const { + return vmaxvq_s32(values); +} + +inline int16_t Vectorized::reduce_max() const { + return vmaxvq_s16(values); +} + +inline int8_t Vectorized::reduce_max() const { + return vmaxvq_s8(values); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_s32(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_s16(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_s8(a, b); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + int64x2_t val = a; + return ~val; +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_s32(a); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_s16(a); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_s8(a); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_s32(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_s16(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_s8(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_s32(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_s16(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_s8(a, b); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint64x2_t maskArray = { + (mask & 1LL) ? 0xFFFFFFFFFFFFFFFF : 0, + (mask & 2LL) ? 0xFFFFFFFFFFFFFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s64(maskArray, b.values, a.values); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint32x4_t maskArray = { + (mask & 1LL) ? 0xFFFFFFFF : 0, + (mask & 2LL) ? 0xFFFFFFFF : 0, + (mask & 4LL) ? 0xFFFFFFFF : 0, + (mask & 8LL) ? 
0xFFFFFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s32(maskArray, b.values, a.values); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint16x8_t maskArray = { + (mask & 1LL) ? 0xFFFF : 0, + (mask & 2LL) ? 0xFFFF : 0, + (mask & 4LL) ? 0xFFFF : 0, + (mask & 8LL) ? 0xFFFF : 0, + (mask & 16LL) ? 0xFFFF : 0, + (mask & 32LL) ? 0xFFFF : 0, + (mask & 64LL) ? 0xFFFF : 0, + (mask & 128LL) ? 0xFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s16(maskArray, b.values, a.values); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint8x16_t maskArray = { + (mask & 1LL) ? 0xFF : 0, + (mask & 2LL) ? 0xFF : 0, + (mask & 4LL) ? 0xFF : 0, + (mask & 8LL) ? 0xFF : 0, + (mask & 16LL) ? 0xFF : 0, + (mask & 32LL) ? 0xFF : 0, + (mask & 64LL) ? 0xFF : 0, + (mask & 128LL) ? 0xFF : 0, + (mask & 256LL) ? 0xFF : 0, + (mask & 512LL) ? 0xFF : 0, + (mask & 1024LL) ? 0xFF : 0, + (mask & 2048LL) ? 0xFF : 0, + (mask & 4096LL) ? 0xFF : 0, + (mask & 8192LL) ? 0xFF : 0, + (mask & 16384LL) ? 0xFF : 0, + (mask & 32768LL) ? 0xFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s8(maskArray, b.values, a.values); +} + +#define VEC_INT_NEON_OPS(vl, bit) \ + inline Vectorized::Vectorized(int##bit##_t val) { \ + values = vdupq_n_s##bit(val); \ + } \ + inline Vectorized Vectorized::loadu( \ + const void* ptr, int64_t count) { \ + if (count == size()) { \ + return vld1q_s##bit(reinterpret_cast(ptr)); \ + } else { \ + __at_align__ int##bit##_t tmp_values[size()]; \ + for (const auto i : c10::irange(size())) { \ + tmp_values[i] = 0; \ + } \ + std::memcpy( \ + tmp_values, \ + reinterpret_cast(ptr), \ + count * sizeof(int##bit##_t)); \ + return vld1q_s##bit(reinterpret_cast(tmp_values)); \ + } \ + } \ + inline void Vectorized::store(void* ptr, int64_t count) \ + const { \ + if (count == size()) { \ + vst1q_s##bit(reinterpret_cast(ptr), values); \ + } else { \ + int##bit##_t tmp_values[size()]; \ + vst1q_s##bit(reinterpret_cast(tmp_values), values); \ + std::memcpy(ptr, tmp_values, count * sizeof(int##bit##_t)); \ + } \ + } + +VEC_INT_NEON_OPS(2, 64) +VEC_INT_NEON_OPS(4, 32) +VEC_INT_NEON_OPS(8, 16) +VEC_INT_NEON_OPS(16, 8) + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return x * y; +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return x / y; +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + int32x4_t x = a; + int32x4_t y = b; + return x / y; +} + +inline int64_t Vectorized::reduce_max() const { + return std::max(values[0], values[1]); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return {std::min(x[0], y[0]), std::min(x[1], y[1])}; +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return {std::max(x[0], y[0]), std::max(x[1], y[1])}; +} + +template +inline Vectorized Vectorized::arange( + 
int64_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int64x2_t step_sizes = {0, 1}; + return base_vec.values + step_sizes * step_vec.values; +} + +template +inline Vectorized Vectorized::arange( + int32_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int32x4_t step_sizes = {0, 1, 2, 3}; + return vmlaq_s32(base_vec, step_sizes, step_vec); +} + +template +inline Vectorized Vectorized::arange( + int16_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int16x8_t step_sizes = {0, 1, 2, 3, 4, 5, 6, 7}; + return vmlaq_s16(base_vec, step_sizes, step_vec); +} + +template +inline Vectorized Vectorized::arange(int8_t base, step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int8x16_t step_sizes = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + return vmlaq_s8(base_vec, step_sizes, step_vec); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + uint64x2_t u = vreinterpretq_u64_s64(y); + uint64x2_t z = {std::min(u[0], (uint64_t)63), std::min(u[1], (uint64_t)63)}; + return x >> vreinterpretq_s64_u64(z); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int32x4_t x = a; + int32x4_t y = b; + uint32x4_t bound = vdupq_n_u32(31); + uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound); + return x >> vreinterpretq_s32_u32(z); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int16x8_t x = a; + int16x8_t y = b; + uint16x8_t bound = vdupq_n_u16(15); + uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound); + return x >> vreinterpretq_s16_u16(z); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int8x16_t x = a; + int8x16_t y = b; + uint8x16_t bound = vdupq_n_u8(7); + int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound)); + return x >> z; +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int64x2_t y = b; + uint64x2_t u = vreinterpretq_u64_s64(y); + uint64x2_t z = {std::min(u[0], (uint64_t)64), std::min(u[1], (uint64_t)64)}; + return vshlq_s64(a, vreinterpretq_s64_u64(z)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int32x4_t y = b; + uint32x4_t bound = vdupq_n_u32(32); + uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound); + return vshlq_s32(a, vreinterpretq_s32_u32(z)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int16x8_t y = b; + uint16x8_t bound = vdupq_n_u16(16); + uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound); + return vshlq_s16(a, vreinterpretq_s16_u16(z)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int8x16_t y = b; + uint8x16_t bound = vdupq_n_u8(8); + int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound)); + return vshlq_s8(a, z); +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 2) { + return b; + } else { + int64x2_t c = {b.values[0], a.values[1]}; + return c; + } +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if 
(count == 0) { + return a; + } else if (count >= 4) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint32x4_t maskArray = { + (count >= 1LL) ? 0xFFFFFFFF : 0, + (count >= 2LL) ? 0xFFFFFFFF : 0, + (count >= 3LL) ? 0xFFFFFFFF : 0, + 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s32(maskArray, b.values, a.values); + } +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 8) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint16x8_t maskArray = { + static_cast((count >= 1LL) ? 0xFFFF : 0), + static_cast((count >= 2LL) ? 0xFFFF : 0), + static_cast((count >= 3LL) ? 0xFFFF : 0), + static_cast((count >= 4LL) ? 0xFFFF : 0), + static_cast((count >= 5LL) ? 0xFFFF : 0), + static_cast((count >= 6LL) ? 0xFFFF : 0), + static_cast((count >= 7LL) ? 0xFFFF : 0), + 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s16(maskArray, b.values, a.values); + } +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 16) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint8x16_t maskArray = { + static_cast((count >= 1LL) ? 0xFF : 0), + static_cast((count >= 2LL) ? 0xFF : 0), + static_cast((count >= 3LL) ? 0xFF : 0), + static_cast((count >= 4LL) ? 0xFF : 0), + static_cast((count >= 5LL) ? 0xFF : 0), + static_cast((count >= 6LL) ? 0xFF : 0), + static_cast((count >= 7LL) ? 0xFF : 0), + static_cast((count >= 8LL) ? 0xFF : 0), + static_cast((count >= 9LL) ? 0xFF : 0), + static_cast((count >= 10LL) ? 0xFF : 0), + static_cast((count >= 11LL) ? 0xFF : 0), + static_cast((count >= 12LL) ? 0xFF : 0), + static_cast((count >= 13LL) ? 0xFF : 0), + static_cast((count >= 14LL) ? 0xFF : 0), + static_cast((count >= 15LL) ? 
0xFF : 0), + 0}; + + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s8(maskArray, b.values, a.values); + } +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + Vectorized highBitsA = vmovl_high_s16(a); + Vectorized highBitsB = vmovl_high_s16(b); + Vectorized lowBitsA = vmovl_s16(vget_low_s16(a)); + Vectorized lowBitsB = vmovl_s16(vget_low_s16(b)); + int32x4_t highBitsResult = highBitsA / highBitsB; + int32x4_t lowBitsResult = lowBitsA / lowBitsB; + return vuzp1q_s16( + vreinterpretq_s16_s32(lowBitsResult), + vreinterpretq_s16_s32(highBitsResult)); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + Vectorized highBitsA = vmovl_high_s8(a); + Vectorized highBitsB = vmovl_high_s8(b); + Vectorized lowBitsA = vmovl_s8(vget_low_s8(a)); + Vectorized lowBitsB = vmovl_s8(vget_low_s8(b)); + int16x8_t highBitsResult = highBitsA / highBitsB; + int16x8_t lowBitsResult = lowBitsA / lowBitsB; + return vuzp1q_s8( + vreinterpretq_s8_s16(lowBitsResult), + vreinterpretq_s8_s16(highBitsResult)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index ba57ca034e9a..735315bee768 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -342,19 +342,19 @@ class Vectorized> { return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); } Vectorized> operator<( - const Vectorized>&) const { + const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> operator<=( - const Vectorized>&) const { + const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> operator>( - const Vectorized>&) const { + 
const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> operator>=( - const Vectorized>&) const { + const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 515cbff730d9..559db3c97567 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -905,7 +905,7 @@ class Vectorized8 : public Vectorizedi { // Because loadu(const void* ptr, T count) requires zero initialization for // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128 // bits of the result are undefined. - // TODO We can use _mm256_zextsi128_si256 in the furture, + // TODO We can use _mm256_zextsi128_si256 in the future, // since gcc 9.3 doesn't support it now. __m128i input_128 = _mm_loadl_epi64(reinterpret_cast(ptr)); return _mm256_castsi128_si256(input_128); @@ -1844,7 +1844,7 @@ Vectorized inline shift_256_16( c0 = _mm256_srav_epi32(a0, b0); c0 = _mm256_shuffle_epi8(c0, ctl_1_0); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%2==1. __m256i a1 = _mm256_and_si256(a, keep_1); __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); @@ -2180,7 +2180,7 @@ Vectorized inline shift_256_8( c0 = _mm256_srlv_epi32(a0, b0); c0 = _mm256_shuffle_epi8(c0, ctl_3_0); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%4==1. __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3); __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); @@ -2193,7 +2193,7 @@ Vectorized inline shift_256_8( c1 = _mm256_srlv_epi32(a1, b1); c1 = _mm256_shuffle_epi8(c1, ctl_3_1); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%4==2. __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3); __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0); @@ -2206,7 +2206,7 @@ Vectorized inline shift_256_8( c2 = _mm256_srlv_epi32(a2, b2); c2 = _mm256_shuffle_epi8(c2, ctl_3_2); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%4==3. 
__m256i a3 = _mm256_and_si256(a, keep_3); __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index dafe444163eb..145ac7aee567 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -1377,7 +1377,7 @@ Vectorized inline maximum( #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) std::pair, Vectorized> inline convert_int8_to_float( at::vec::Vectorized src) { - auto s8x8 = vld1_s8(src.operator const int8_t*()); + auto s8x8 = vget_low_s8(src); auto s16x8 = vmovl_s8(s8x8); auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8)); @@ -1402,7 +1402,7 @@ std::pair, Vectorized> inline convert_int8_to_float( Vectorized inline convert_int8_half_register_to_float( at::vec::Vectorized src) { - auto s8x8 = vld1_s8(src.operator const int8_t*()); + auto s8x8 = vget_low_s8(src); auto s16x8 = vmovl_s8(s8x8); auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h index 5f80a7c2bcff..8b2768fab6a3 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -1088,7 +1088,7 @@ class Vectorized8 : public Vectorizedi { // Because loadu(const void* ptr, T count) requires zero initialization for // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384 // bits of the result are undefined. - // TODO We can use _mm512_zextsi128_si512 in the furture, + // TODO We can use _mm512_zextsi128_si512 in the future, // since gcc 9.3 doesn't support it now. __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr)); return _mm512_castsi128_si512(input_128); @@ -2022,7 +2022,7 @@ Vectorized inline shift_512_8( c0 = _mm512_srlv_epi16(a0, b0); c0 = _mm512_shuffle_epi8(c0, ctl_1_0); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%2==1. 
__m512i a1 = _mm512_and_si512(a, keep_1); __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0); diff --git a/aten/src/ATen/cpu/vec/vec_quant.h b/aten/src/ATen/cpu/vec/vec_quant.h index 36602c4a760f..ae9e86c6a9c8 100644 --- a/aten/src/ATen/cpu/vec/vec_quant.h +++ b/aten/src/ATen/cpu/vec/vec_quant.h @@ -149,5 +149,105 @@ static inline void pack_vnni4( #endif } +// This is a helper function for transpose_pack_vnni4 +// Transform a [4, 16] block (with incontiguous output) +// Src: +// a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 +// b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16 +// c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16 +// d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16 +// Dst: +// a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4 +// a5 a6 a7 a8 b5 b6 b7 b8 c5 c6 c7 c8 d5 d6 d7 d8 +// a9 a10 a11 a12 b9 b10 b11 b12 c9 c10 c11 c12 d9 d10 d11 d12 +// a13 a14 a15 a16 b13 b14 b15 b16 c13 c14 c15 c16 d13 d14 d15 d16 +template > +static inline void transpose_vnni4_pad_4x16_block( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int64_t ld_dst, + int krem = 4) { +#if defined(CPU_CAPABILITY_AVX512) + __m128i r[4]; + for (int i = 0; i < krem; ++i) { + r[i] = _mm_loadu_si128(reinterpret_cast(src + i * ld_src)); + } + for (int i = krem; i < 4; ++i) { + r[i] = _mm_setzero_si128(); + } + + // Transpose 4x16 bytes using unpack and shuffle + __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); + __m128i t1 = _mm_unpackhi_epi32(r[0], r[1]); + __m128i t2 = _mm_unpacklo_epi32(r[2], r[3]); + __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); + + __m128i r0 = _mm_unpacklo_epi64(t0, t2); + __m128i r1 = _mm_unpackhi_epi64(t0, t2); + __m128i r2 = _mm_unpacklo_epi64(t1, t3); + __m128i r3 = _mm_unpackhi_epi64(t1, t3); + + // Store output + if (krem == 4) { + // normal case + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), r0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst), r1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 2), r2); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 3), r3); + } else { + // masked case + __mmask16 mask = (1ULL << (krem * 4)) - 1; + _mm_mask_storeu_epi8(dst, mask, r0); + _mm_mask_storeu_epi8(reinterpret_cast<__m128i*>(dst + ld_dst), mask, r1); + _mm_mask_storeu_epi8( + reinterpret_cast<__m128i*>(dst + ld_dst * 2), mask, r2); + _mm_mask_storeu_epi8( + reinterpret_cast<__m128i*>(dst + ld_dst * 3), mask, r3); + } +#else + TORCH_CHECK( + false, + "transpose_vnni4_pad_4x16_block is only supported when AVX-512 is supported") +#endif +} + +// Do the transpose packing fusion with VNNI4 +// Reorder [K, N] → [N/4, K, 4] (VNNI4-style layout for bit8) +template > +static inline void transpose_pack_vnni4( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int64_t K, + int64_t N) { +#if defined(CPU_CAPABILITY_AVX512) + TORCH_CHECK( + N % 16 == 0, "N needs to be multiple of 16 for transpose_pack_vnni4"); + int64_t bk = 0; + int64_t _K = K / 4 * 4; + for (; bk < _K; bk += 4) { + int64_t bn = 0; + for (; bn < N; bn += 16) { + transpose_vnni4_pad_4x16_block( + src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4); + } + } + + // Handle leftover K rows (< 4) + if (K % 4 != 0) { + int krem = K - bk; + int64_t bn = 0; + for (; bn < N; bn += 16) { + transpose_vnni4_pad_4x16_block( + src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4, krem); + } + } +#else + TORCH_CHECK( + false, "transpose_pack_vnni4 is only supported when AVX-512 is supported") +#endif +} + } // namespace CPU_CAPABILITY } // namespace at::vec 
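Annotation, not part of the patch: the new transpose_pack_vnni4 above reorders a [K, N] bit8 matrix into the [N/4, K, 4] VNNI4 layout via AVX-512 4x16 blocks. As a reference for what the fast path produces, here is a hedged scalar sketch of the same index mapping; the formula is derived from the block comments in the hunk, and the _ref suffix marks it as an illustrative helper rather than code from the PR.

#include <cstdint>

// Scalar reference: dst holds N/4 groups of K*4 elements (the AVX-512 path already
// requires N % 16 == 0). Source element (k, n) lands at group n/4, row k, lane n%4.
template <typename scalar_t>
void transpose_pack_vnni4_ref(
    const scalar_t* src,
    scalar_t* dst,
    int64_t ld_src,
    int64_t K,
    int64_t N) {
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t k = 0; k < K; ++k) {
      dst[(n / 4) * K * 4 + k * 4 + (n % 4)] = src[k * ld_src + n];
    }
  }
}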
diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index a81d34df4d64..6933099bb1f3 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -16,6 +16,8 @@ #include #include +#include + #ifdef USE_ROCM #include #include @@ -108,7 +110,7 @@ static hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status error) namespace { -static cublasOperation_t _cublasOpFromChar(char op) { +cublasOperation_t _cublasOpFromChar(char op) { // NOLINTNEXTLINE(bugprone-switch-missing-default-case) switch (op) { case 'n': @@ -128,7 +130,7 @@ static cublasOperation_t _cublasOpFromChar(char op) { "_cublasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); } -static void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { +void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { // Note: leading dimensions generally are checked that they are > 0 // and at least as big the result requires (even if the value won't // be used). @@ -142,7 +144,7 @@ static void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { *lda = std::max(m, 1); } -static void _cublasAdjustLdLevel3( +void _cublasAdjustLdLevel3( char transa, char transb, int64_t m, @@ -191,6 +193,10 @@ uint32_t _getAlignment(uintptr_t address) { #ifdef USE_ROCM static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) { + // 0 is default value, meaning full CUs i.e. no mask + if (value == 0) { + return at::cuda::getCurrentCUDAStream(); + } static int32_t last_value = 0; static hipStream_t stream; if (last_value == 0) { @@ -209,15 +215,15 @@ static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) { int32_t CUs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; // how many uint32_t do we need to cover all CUs, fill bitmask with 1 uint32_t mask_size = static_cast((CUs + 32 - 1) / 32); - std::vector mask(mask_size, uint32_t{0xffffffff}); + std::vector mask(mask_size, uint32_t{0x00000000}); // starting from lowest order bits, in 32-bit chunks // set bits to 0 based on how many CUs to carve out int32_t full_shifts = value / 32; int32_t remainder = value % 32; for (int32_t i = 0; i < full_shifts; i++) { - mask[i] = uint32_t{0x00000000}; + mask[i] = uint32_t{0xffffffff}; } - mask[full_shifts] = uint32_t{0xffffffff} << remainder; + mask[full_shifts] = uint32_t{0xffffffff} << (32 - remainder); // finally, create masked stream AT_CUDA_CHECK(hipExtStreamCreateWithCUMask(&stream, mask_size, &mask[0])); @@ -319,7 +325,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template - inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { + void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { // NOLINTNEXTLINE(bugprone-sizeof-expression) TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value))); } @@ -341,7 +347,7 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template - inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { + void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T))); } }; @@ -356,7 +362,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template - inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { + void setAttribute(cublasLtMatmulPreferenceAttributes_t 
attr, const T value) { TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T))); } }; @@ -391,7 +397,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v>) { @@ -418,25 +424,40 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D abType = CUDA_R_16F; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16F; #ifndef USE_ROCM - if (!at::globalContext().allowFP16ReductionCuBLAS()) { - preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, - CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); + if (fp16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + uint32_t mask = + fp16_reduction == + at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK + ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | + CUBLASLT_REDUCTION_SCHEME_NONE) + : CUBLASLT_REDUCTION_SCHEME_NONE; + preference.setAttribute( + CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); } #endif } else if constexpr (std::is_same_v) { abType = CUDA_R_16BF; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; #ifndef USE_ROCM - if (!at::globalContext().allowBF16ReductionCuBLAS()) { - preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, - CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); + if (bf16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + uint32_t mask = + bf16_reduction == + at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK + ? 
(CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | + CUBLASLT_REDUCTION_SCHEME_NONE) + : CUBLASLT_REDUCTION_SCHEME_NONE; + preference.setAttribute( + CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); } #endif } else { static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented"); } - globalContext().alertCuBLASConfigNotDeterministic(); cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -570,8 +591,6 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -583,8 +602,6 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -596,8 +613,6 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { template <> void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -611,8 +626,6 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::co template <> void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -626,8 +639,6 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::com template inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -699,8 +710,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP template inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); BGEMM_CHECK_ARGVALUES(at::BFloat16); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -1024,8 +1033,6 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = 
at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1037,8 +1044,6 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1050,8 +1055,6 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1065,8 +1068,6 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::comp template <> void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1080,8 +1081,6 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::compl template inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1139,8 +1138,15 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( } if (prop->major >= 5) { cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; - if (!at::globalContext().allowFP16ReductionCuBLAS()) { - cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); + auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); + TORCH_CHECK(fp16_reduction != + at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK, + "torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction(" + "..., allow_splitk=False) requires the cuBLASLt backend"); + if (fp16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + cublas_flags = static_cast( + cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); } // Disallow fp16 reductions that could lead to unexpected overflow issues. 
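// Annotation, not part of the patch: the repeated cuBLASLt hunks in this file map the
// new three-state at::CuBLASReductionOption onto a reduction-scheme mask. A hedged
// consolidation of that mapping is sketched below; the helper name is illustrative,
// not a helper introduced by the PR.
static uint32_t lt_reduction_scheme_mask(at::CuBLASReductionOption opt) {
  // AllowReducedPrecisionWithSplitK: callers skip setting the preference entirely.
  // DisallowReducedPrecisionAllowSplitK: split-K stays allowed as long as partial
  // results are kept in the compute type (or no reduction scheme is used at all).
  if (opt == at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK) {
    return CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE;
  }
  // DisallowReducedPrecisionDisallowSplitK: only the "no reduction" scheme remains.
  return CUBLASLT_REDUCTION_SCHEME_NONE;
}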
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags)); @@ -1190,7 +1196,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( template inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1200,8 +1205,15 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT GEMM_CHECK_ARGVALUES(at::BFloat16); #ifndef USE_ROCM cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; - if (!at::globalContext().allowBF16ReductionCuBLAS()) { - cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); + auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); + TORCH_CHECK(bf16_reduction != + at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK, + "torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction(" + "..., allow_splitk=False) requires the cuBLASLt backend"); + if (bf16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + cublas_flags = static_cast( + cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); } #endif #if defined(USE_ROCM) @@ -1290,7 +1302,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) } #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 + if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } else{ at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); @@ -1579,7 +1591,7 @@ bool gemm_and_bias( computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v) { @@ -1597,18 +1609,34 @@ bool gemm_and_bias( abType = CUDA_R_16F; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16F; #ifndef USE_ROCM - if (!at::globalContext().allowFP16ReductionCuBLAS()) { - preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, - CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); + if (fp16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + uint32_t mask = + fp16_reduction == + at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK + ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | + CUBLASLT_REDUCTION_SCHEME_NONE) + : CUBLASLT_REDUCTION_SCHEME_NONE; + preference.setAttribute( + CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); } #endif } else if constexpr (std::is_same_v) { abType = CUDA_R_16BF; cType = (std::is_same_v) ?
CUDA_R_32F : CUDA_R_16BF; #ifndef USE_ROCM - if (!at::globalContext().allowBF16ReductionCuBLAS()) { - preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, - CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); + if (bf16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + uint32_t mask = + bf16_reduction == + at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK + ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | + CUBLASLT_REDUCTION_SCHEME_NONE) + : CUBLASLT_REDUCTION_SCHEME_NONE; + preference.setAttribute( + CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); } #endif } @@ -1637,9 +1665,7 @@ bool gemm_and_bias( if (activation == GEMMAndBiasActivationEpilogue::RELU) { epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { -#if CUDA_VERSION >= 11040 || defined(USE_ROCM) epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; -#endif } if (bias != nullptr) { @@ -1837,6 +1863,8 @@ template bool gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +using at::blas::ScalingType; + int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) { switch (scaling_type) { case ScalingType::BlockWise1x32: @@ -1928,14 +1956,15 @@ void scaled_gemm( const void *result_scale_ptr, int64_t result_ld, ScalarType result_dtype, - bool use_fast_accum) { + bool use_fast_accum, + const std::optional& alpha) { // Note: see `cublasCommonArgs` for various non-intuitive manupulations // of input arguments to this function. -#if CUDA_VERSION >= 11080 || defined(USE_ROCM) const auto computeType = CUBLAS_COMPUTE_32F; const auto scaleType = CUDA_R_32F; - const float alpha_val = 1.0; - const float beta_val = 0.0; + // Note: alpha_val may change later depending on user-passed argument + float alpha_val = 1.0; + float beta_val = 0.0; CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa)); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); @@ -1954,8 +1983,8 @@ void scaled_gemm( #if ROCM_VERSION >= 70000 if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) { // TODO: add constraints based on hipblaslt internals - TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), - "Matrix dimensions must be multiples of 32 for MX format. " + TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0), + "M, N must be multiples of 16 and K should be multiple of 128 for MX format. " "Got m=", m, ", n=", n, ", k=", k); } #endif @@ -2006,6 +2035,33 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype)); } + + // Handle user-passed alpha + float *alpha_ptr = &alpha_val; + float *beta_ptr = &beta_val; + + if (alpha.has_value()) { + auto& a = alpha.value(); + + // if device-tensor + if (a.is_cuda()) { + // NOTE: there are lifetime requirements on device-side pointers for alpha/beta -- the value must be + // valid & correct until the cublas call finishes (not is scheduled like host-side values). Thus + // we need to use allocations for alpha/beta that have some guarantees on lifetime - a statically + // managed 4B buffer for alpha that we'll copy the passed alpha value into, and constant memory + // for beta respectively. 
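The scaled_gemm() path above is what torch._scaled_mm() ends up calling; a hedged usage sketch (assumes an fp8-capable GPU and a recent build where _scaled_mm returns a single tensor):

```python
import torch

a = torch.randn(128, 64, device="cuda").to(torch.float8_e4m3fn)       # row-major mat1
b = torch.randn(128, 64, device="cuda").to(torch.float8_e4m3fn).t()   # column-major mat2, shape (64, 128)

# fp32 tensor-wise scales -> ScalingType::TensorWise on the C++ side
scale_a = torch.tensor(1.0, device="cuda")
scale_b = torch.tensor(1.0, device="cuda")

out = torch._scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)  # (128, 128) bf16 result
```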
+ float *user_alpha_ptr = at::cuda::detail::get_user_alpha_ptr(); + at::Tensor user_alpha = at::from_blob(user_alpha_ptr, {1}, TensorOptions().device(kCUDA).dtype(kFloat)); + user_alpha.copy_(a); + // Tell cublasLt we're using device-side pointers for alpha/beta + auto pointer_mode = CUBLASLT_POINTER_MODE_DEVICE; + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_POINTER_MODE, pointer_mode); + alpha_ptr = user_alpha.data_ptr(); + beta_ptr = at::cuda::detail::get_cublas_device_zero(); + } else { + alpha_val = a.item(); + } + } // For other data types, use the get_scale_mode function based on scaling type // The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt, // but we must invoke get_scale_mode anyways to trigger the version checks. @@ -2023,6 +2079,7 @@ void scaled_gemm( cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( ltHandle, computeDesc.descriptor(), @@ -2063,10 +2120,10 @@ void scaled_gemm( auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported( ltHandle, computeDesc.descriptor(), - &alpha_val, + alpha_ptr, Adesc.descriptor(), Bdesc.descriptor(), - &beta_val, + beta_ptr, Cdesc.descriptor(), Ddesc.descriptor(), all_algos[i].algo, @@ -2085,17 +2142,14 @@ void scaled_gemm( cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, computeDesc.descriptor(), - &alpha_val, + alpha_ptr, mat1_ptr, Adesc.descriptor(), mat2_ptr, Bdesc.descriptor(), - &beta_val, -#ifdef USE_ROCM + beta_ptr, + // NOTE: always use result_ptr here, because cuBLASLt w/device beta=0 can't handle nullptr either result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr -#else - nullptr, -#endif // ifdef USE_ROCM Cdesc.descriptor(), result_ptr, Ddesc.descriptor(), @@ -2133,8 +2187,6 @@ void scaled_gemm( " scaleType ", scaleType); return; -#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) - TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); } void int8_gemm( @@ -2409,8 +2461,6 @@ void trsmBatched>( template <> void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2426,8 +2476,6 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // gemv is bw bound, and does not benefit from TF32. But the precision // loss still happens on TF32. So we disable it here. NoTF32Guard disable_tf32; - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2440,8 +2488,6 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2455,8 +2501,6 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { // gemv is bw bound, and does not benefit from TF32. But the precision // loss still happens on TF32. So we disable it here. 
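The alertCuBLASConfigNotDeterministic() calls removed throughout these gemm/bgemm/gemv wrappers are the ones triggered by deterministic mode; for reference, the Python-side combination that exercises that check (a sketch, assuming a CUDA device):

```python
import os
# Must be one of the two values cuBLAS accepts for deterministic workspaces,
# and must be set before the first cuBLAS call.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"   # or ":16:8"

import torch
torch.use_deterministic_algorithms(True)

a = torch.randn(128, 128, device="cuda", dtype=torch.double)
b = torch.randn(128, 128, device="cuda", dtype=torch.double)
c = a @ b   # previously these call sites could raise if the workspace config was missing
```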
NoTF32Guard disable_tf32; - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index b235840418e2..0295948311a5 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -14,6 +14,7 @@ */ #include +#include #include namespace at::cuda::blas { @@ -136,15 +137,6 @@ void int8_gemm( int32_t* result_ptr, int64_t result_ld); -enum class ScalingType : std::uint8_t { - TensorWise, // fp32 scales - RowWise, // fp32 scales - BlockWise1x16, // fp8_e4m3fn scales - BlockWise1x32, // fp8_e8m0fnu scales - BlockWise1x128, // fp32 scales - BlockWise128x128, // fp32 scales -}; - void scaled_gemm( char transa, char transb, @@ -156,20 +148,21 @@ void scaled_gemm( int64_t mat1_ld, ScalarType mat1_dtype, ScalarType mat1_scale_dtype, - ScalingType mat1_scaling_type, + at::blas::ScalingType mat1_scaling_type, const void* mat2_ptr, const void* mat2_scale_ptr, int64_t mat2_ld, ScalarType mat2_dtype, ScalarType mat2_scale_dtype, - ScalingType mat2_scaling_type, + at::blas::ScalingType mat2_scaling_type, const void* bias_ptr, ScalarType bias_dtype, void* result_ptr, const void* result_scale_ptr, int64_t result_ld, ScalarType result_dtype, - bool use_fast_accum); + bool use_fast_accum, + const std::optional& alpha); #define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index f95faa94e611..2e387fbc264d 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -15,19 +15,19 @@ namespace cuda::detail { namespace { // Total number of gpus in the system. -static int64_t num_gpus; +int64_t num_gpus; // Ensures default_gens_cuda is initialized once. -static std::deque cuda_gens_init_flag; +std::deque cuda_gens_init_flag; // Default, global CUDA generators, one per GPU. -static std::vector default_gens_cuda; +std::vector default_gens_cuda; /* * Populates the global variables related to CUDA generators * Warning: this function must only be called once! */ -static void initCUDAGenVector() { +void initCUDAGenVector() { // Ensures we only call cudaGetDeviceCount only once. static bool num_gpu_init_flag [[maybe_unused]] = []() { num_gpus = static_cast(c10::cuda::device_count()); @@ -109,7 +109,7 @@ void CUDAGeneratorState::increase(uint64_t increment) { offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4."); // Ensures the increment does not cause overflow. TORCH_INTERNAL_ASSERT( - offset_intragraph_ <= std::numeric_limits::max() - increment, + offset_intragraph_ <= std::numeric_limits::max() - increment, "Increment causes overflow in the offset value."); offset_intragraph_ += increment; } else { @@ -325,9 +325,9 @@ uint64_t CUDAGeneratorImpl::seed() { */ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. 
- static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(int64_t); + constexpr size_t total_size = seed_size + offset_size; auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt); auto rng_state = state_tensor.data_ptr(); @@ -346,9 +346,9 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. */ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(int64_t); + constexpr size_t total_size = seed_size + offset_size; detail::check_rng_state(new_state); @@ -461,7 +461,7 @@ void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) { */ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { - uint32_t offset = state_->offset_intragraph_; + uint64_t offset = state_->offset_intragraph_; state_->increase(increment); return PhiloxCudaState( state_->seed_extragraph_.data_ptr(), diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index b0b77cb822a8..d4ab49382e7f 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -96,16 +96,16 @@ struct CUDAGraph; struct CUDAGeneratorState : public c10::intrusive_ptr_target { uint64_t seed_; uint64_t philox_offset_per_thread_; - uint32_t offset_intragraph_; + uint64_t offset_intragraph_; bool capturing_{}; std::unordered_set registered_graphs_; - at::TensorBase seed_extragraph_{}; - at::TensorBase offset_extragraph_{}; + at::TensorBase seed_extragraph_; + at::TensorBase offset_extragraph_; CUDAGeneratorState( uint64_t seed = default_rng_seed_val, uint64_t philox_offset_per_thread = 0, - uint32_t offset_intragraph = 0) + uint64_t offset_intragraph = 0) : seed_(seed), philox_offset_per_thread_(philox_offset_per_thread), offset_intragraph_(offset_intragraph) {} @@ -167,7 +167,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { CUDAGeneratorImpl* clone_impl() const override; c10::intrusive_ptr state_; - std::atomic_flag no_reset_rnn_state_{}; + std::atomic_flag no_reset_rnn_state_; }; namespace cuda::detail { diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index c18ad66b2080..a32e7b4b86f0 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -56,7 +56,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph { // the ID assigned by cuda during graph capture, // used to identify when a stream is participating in capture - CaptureId_t capture_id_ = -1; + CaptureId_t capture_id_ = 0; // uuid used to request a particular private mempool from CUDACachingAllocator. // By default, this will be set to {id_, 0}. 
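The get_state()/set_state() pair above backs the Python-level RNG state round-trip (seed bytes followed by the Philox offset); a small sketch assuming a CUDA device:

```python
import torch

torch.cuda.manual_seed(1234)
state = torch.cuda.get_rng_state()   # uint8 tensor laid out as seed + Philox offset (see get_state above)
x = torch.randn(4, device="cuda")

torch.cuda.set_rng_state(state)      # rewind to the snapshot taken above
y = torch.randn(4, device="cuda")
assert torch.equal(x, y)
```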
diff --git a/aten/src/ATen/cuda/CUDASparse.h b/aten/src/ATen/cuda/CUDASparse.h index 736fbe4ae50d..e00e50b38d2d 100644 --- a/aten/src/ATen/cuda/CUDASparse.h +++ b/aten/src/ATen/cuda/CUDASparse.h @@ -6,43 +6,15 @@ #define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch) #endif -// cuSparse Generic API added in CUDA 10.1 -// Windows support added in CUDA 11.0 -#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32))) -#define AT_USE_CUSPARSE_GENERIC_API() 1 -#else -#define AT_USE_CUSPARSE_GENERIC_API() 0 -#endif - -// cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0 -#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ - (CUSPARSE_VERSION < 12000) -#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1 -#else -#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0 -#endif - -#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ - (CUSPARSE_VERSION >= 12000) -#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1 -#else -#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0 -#endif #if defined(USE_ROCM) // hipSparse const API added in v2.4.0 #if HIPSPARSE_VERSION >= 200400 -#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1 -#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #else -#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 -#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #endif #else // USE_ROCM -#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 -#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 0 #endif // USE_ROCM diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index 092314ac81f2..d5f04df55f9c 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -12,8 +12,6 @@ cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) { return cusparseDestroyDnMat(const_cast(dnMatDescr)); } -#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() - namespace { // If a specific GPU model does not provide native support for a given data @@ -210,6 +208,4 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6 descriptor_.reset(raw_descriptor); } -#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() - } // namespace at::cuda::sparse diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h index 7fc482f2a3fb..f12ef628e13f 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.h +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h @@ -35,7 +35,6 @@ class CuSparseDescriptor { std::unique_ptr> descriptor_; }; -#if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() template struct ConstCuSparseDescriptorDeleter { void operator()(T* x) { @@ -58,7 +57,6 @@ class ConstCuSparseDescriptor { protected: std::unique_ptr> descriptor_; }; -#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS #if defined(USE_ROCM) using cusparseMatDescr = std::remove_pointer_t; @@ -123,39 +121,8 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info #endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE -#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() - cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); -#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || 
AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() -class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor - : public CuSparseDescriptor { - public: - explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); -}; - -class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor - : public CuSparseDescriptor { - public: - explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); - cusparseDnMatDescr* unsafe_mutable_descriptor() const { - return const_cast(descriptor()); - } - cusparseDnMatDescr* unsafe_mutable_descriptor() { - return const_cast(descriptor()); - } -}; - -class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor - : public CuSparseDescriptor { - public: - explicit CuSparseDnVecDescriptor(const Tensor& input); -}; - -class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor - : public CuSparseDescriptor {}; - -#elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public ConstCuSparseDescriptor< cusparseDnMatDescr, @@ -194,7 +161,6 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor : public ConstCuSparseDescriptor< cusparseSpMatDescr, &cusparseDestroySpMat> {}; -#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor : public CuSparseSpMatDescriptor { @@ -283,6 +249,4 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor } }; -#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() - } // namespace at::cuda::sparse diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 34aa15d0c06c..a2260d23b2d4 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -9,7 +9,6 @@ #include #include -#include namespace at::cuda { namespace { @@ -72,9 +71,20 @@ using Block = HostBlock; struct CUDACachingHostAllocatorImpl : public CachingHostAllocatorImpl { private: - std::unordered_map use_host_register; + ska::flat_hash_map use_host_register; void allocate_host_memory(size_t size, void** ptr) override { + // try allocating from reserve segment first before calling into expensive APIs + if (get_reserve_segment().initialized()) { + *ptr = get_reserve_segment().allocate(size); + if (*ptr != nullptr) { + return; + } + } + allocate_host_memory_slowpath(size, ptr); + } + + void allocate_host_memory_slowpath(size_t size, void** ptr) { // Pinned memory pointers allocated by any device can be directly used by // any other device, regardless of the current device at the time of // allocation, since we assume unified addressing. So we grab any existing @@ -113,6 +123,18 @@ struct CUDACachingHostAllocatorImpl } void free_block(Block* block) override { + // We never free blocks from the reserve segment + if (get_reserve_segment().initialized()) { + // Check if the block is from the reserve segment + if (get_reserve_segment().owns(block->ptr_)) { + return; + } + } + + free_block_slowpath(block); + } + + void free_block_slowpath(Block* block) { auto start = std::chrono::steady_clock::now(); // Users may change the allocator config at will. torch unit tests do this. 
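The reserve-segment fast path above is sized by the pinned_reserve_segment_size_mb allocator option; a hedged sketch of how it would be exercised (assuming a build with this patch and that the option is parsed from PYTORCH_CUDA_ALLOC_CONF like the other pinned_* settings):

```python
import os
# Assumption: read from the allocator config string, so it must be set
# before the first pinned allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "pinned_reserve_segment_size_mb:64"

import torch

staging = torch.empty(8 << 20, dtype=torch.uint8, pin_memory=True)  # small requests may be carved from the reserve segment
gpu = staging.to("cuda", non_blocking=True)                         # pinned source enables the async H2D copy
```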
// However, allocations using cudaHostRegister should use corresonding @@ -161,17 +183,26 @@ struct CUDACachingHostAllocatorImpl return true; } - bool pinned_use_background_threads() override { - return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: - pinned_use_background_threads(); - } - EventPool::Event create_event_internal(DeviceIndex idx) { // Leak the event pool to avoid shutdown issue. static auto* event_pool = new EventPool(); return event_pool->get(idx); } + PinnedReserveSegment& get_reserve_segment() { + static auto reserve_segment = [&]() { + if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) { + void *ptr; + size_t sz = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() * 1024 * 1024; + allocate_host_memory_slowpath(sz, &ptr); + return PinnedReserveSegment(ptr, sz); + } else { + return PinnedReserveSegment(); + } + } (); + return reserve_segment; + } + TaskThreadPool* getThreadPool() { static TaskThreadPool* pool = new TaskThreadPool( static_cast(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: @@ -186,15 +217,15 @@ struct CUDACachingHostAllocatorImpl size_t numThreads, size_t pageSize) { uintptr_t start = (uintptr_t)ptr + (size * i / numThreads); - uintptr_t end = (uintptr_t)start + (size / numThreads); + uintptr_t end = start + (size / numThreads); if (i == (numThreads - 1)) { end = (uintptr_t)ptr + size; } // pre-fault/map the pages by setting the first byte of the page uintptr_t alignedStart = - (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1)); - for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) { + ((start + pageSize - 1) & ~(pageSize - 1)); + for (uintptr_t p = alignedStart; p < (end); p += pageSize) { // NOLINTNEXTLINE(performance-no-int-to-ptr) memset((void*)p, 0, 1); } diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 329851341443..d7832c761ae5 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -310,7 +310,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. 
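These handle-level math-mode decisions correspond to the Python TF32 switches; a short sketch (the fp32_precision attribute is assumed to exist in this build and is guarded accordingly):

```python
import torch

torch.backends.cuda.matmul.allow_tf32 = True   # classic boolean switch

# Newer per-backend/per-op precision API behind Float32Backend/Float32Op (assumed attribute name):
if hasattr(torch.backends.cuda.matmul, "fp32_precision"):
    torch.backends.cuda.matmul.fp32_precision = "tf32"

a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")
c = a @ b   # eligible for CUBLAS_TF32_TENSOR_OP_MATH
```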
if (!NoTF32Guard::should_disable_tf32() && - at::globalContext().float32Precision("cuda", "matmul") == "tf32") { + at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 23a3ff8c8958..7828c3917fc4 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -177,7 +177,6 @@ inline void segmented_sort_pairs( } } -#if CUB_SUPPORTS_UNIQUE_BY_KEY() template inline void unique_by_key( KeysInputIteratorT keys_in, ValuesInputIteratorT values_in, @@ -193,7 +192,6 @@ inline void unique_by_key( CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey, keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream()); } -#endif namespace impl { @@ -579,7 +577,6 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT #endif } -#if CUB_SUPPORTS_SCAN_BY_KEY() template inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) { @@ -607,7 +604,6 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT #endif } -#endif template void unique(InputIteratorT input, OutputIteratorT output, diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh index b80951269209..0d76ae6e8dcf 100644 --- a/aten/src/ATen/cuda/cub_definitions.cuh +++ b/aten/src/ATen/cuda/cub_definitions.cuh @@ -28,22 +28,6 @@ #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false #endif -// cub support for UniqueByKey is added to cub 1.16 in: -// https://github.com/NVIDIA/cub/pull/405 -#if CUB_VERSION >= 101600 -#define CUB_SUPPORTS_UNIQUE_BY_KEY() true -#else -#define CUB_SUPPORTS_UNIQUE_BY_KEY() false -#endif - -// cub support for scan by key is added to cub 1.15 -// in https://github.com/NVIDIA/cub/pull/376 -#if CUB_VERSION >= 101500 -#define CUB_SUPPORTS_SCAN_BY_KEY() 1 -#else -#define CUB_SUPPORTS_SCAN_BY_KEY() 0 -#endif - // cub support for cub::FutureValue is added to cub 1.15 in: // https://github.com/NVIDIA/cub/pull/305 #if CUB_VERSION >= 101500 diff --git a/aten/src/ATen/cuda/detail/BLASConstants.cu b/aten/src/ATen/cuda/detail/BLASConstants.cu new file mode 100644 index 000000000000..967388044705 --- /dev/null +++ b/aten/src/ATen/cuda/detail/BLASConstants.cu @@ -0,0 +1,54 @@ +#include +#include +#include + +#include + +namespace at { +namespace cuda { +namespace detail { + +__device__ __constant__ float cublas_one_device; +__device__ __constant__ float cublas_zero_device; + +float *get_cublas_device_one() { + static c10::once_flag init_flag; + + c10::call_once(init_flag, []() { + const float one = 1.f; + AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float))); + }); + + float *ptr; + AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast(&ptr), cublas_one_device)); + return ptr; +} + +float *get_cublas_device_zero() { + static c10::once_flag init_flag; + + c10::call_once(init_flag, []() { + const float zero = 0.f; + AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float))); + }); + + float *ptr; + AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast(&ptr), cublas_zero_device)); + return ptr; +} + +float *get_user_alpha_ptr() { + static float *alpha_ptr; + + static c10::once_flag init_flag; 
+ + c10::call_once(init_flag, []() { + AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float))); + }); + + return alpha_ptr; +} + +} // namespace detail +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/BLASConstants.h b/aten/src/ATen/cuda/detail/BLASConstants.h new file mode 100644 index 000000000000..d62aaf1330ee --- /dev/null +++ b/aten/src/ATen/cuda/detail/BLASConstants.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +namespace at::cuda::detail { + +float *get_cublas_device_one(); +float *get_cublas_device_zero(); +float *get_user_alpha_ptr(); + +} // namespace at::cuda::detail diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 72826b584792..b7f80101d926 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -281,6 +281,9 @@ bool CUDAHooks::compiledWithMIOpen() const { bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() + if (!hasCUDA()) { + return false; + } // NOTE: extra parenthesis around numbers disable clang warnings about // dead code return true; @@ -291,6 +294,9 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() + if (!hasCUDA()) { + return false; + } cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); // Check for Volta cores if (prop->major >= 7) { @@ -305,6 +311,26 @@ bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const { #if AT_CUDNN_ENABLED() + if (!hasCUDA()) { + return false; + } + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + // Check for Volta cores + if (prop->major >= 8) { + return true; + } else { + return false; + } +#else + return false; +#endif +} + +bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const { +#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300) + if (!hasCUDA()) { + return false; + } cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); // Check for Volta cores if (prop->major >= 8) { diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 2780369a37b7..8d3d1db00392 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -17,7 +17,7 @@ TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)()); // The real implementation of CUDAHooksInterface struct CUDAHooks : public at::CUDAHooksInterface { - CUDAHooks(at::CUDAHooksArgs) {} + CUDAHooks(at::CUDAHooksArgs /*unused*/) {} void init() const override; Device getDeviceFromPtr(void* data) const override; bool isPinnedPtr(const void* data) const override; @@ -45,6 +45,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool supportsDilatedConvolutionWithCuDNN() const override; bool supportsDepthwiseConvolutionWithCuDNN() const override; bool supportsBFloat16ConvolutionWithCuDNNv8() const override; + bool supportsBFloat16RNNWithCuDNN() const override; bool hasCUDART() const override; long versionCUDART() const override; long versionCuDNN() const override; diff --git a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h index 1f80c863b639..71a344d281d2 100644 --- a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h +++ b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h @@ -122,7 +122,7 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this 0) { + if(!my_handles.empty()) { auto parent = 
weak_parent.lock(); if (!parent) { // If this thread exits after atexit handlers have completed, the diff --git a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh index 231cd167cacb..7de0321256fd 100644 --- a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh +++ b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh @@ -19,7 +19,7 @@ struct PhiloxCudaState { // Called if graph capture is underway PhiloxCudaState(int64_t* seed, int64_t* offset_extragraph, - uint32_t offset_intragraph) { + uint64_t offset_intragraph) { seed_.ptr = seed; offset_.ptr = offset_extragraph; offset_intragraph_ = offset_intragraph; @@ -36,7 +36,7 @@ struct PhiloxCudaState { Payload seed_{}; Payload offset_{}; - uint32_t offset_intragraph_ = 0; + uint64_t offset_intragraph_ = 0; bool captured_ = false; }; diff --git a/aten/src/ATen/cuda/tunable/GemmCommon.h b/aten/src/ATen/cuda/tunable/GemmCommon.h index 6d19907aba4a..5d9e33b2b5b2 100644 --- a/aten/src/ATen/cuda/tunable/GemmCommon.h +++ b/aten/src/ATen/cuda/tunable/GemmCommon.h @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -29,7 +30,7 @@ namespace at::cuda::tunable { -using at::cuda::blas::ScalingType; +using at::blas::ScalingType; enum class BlasOp { N = 0, @@ -150,6 +151,7 @@ inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) { BLASType = "unknown"; } return BLASType; + } // Similar to Compute Type in GemmRocblas.h @@ -162,7 +164,7 @@ inline std::string ComputeTypeFor() { // ROCBLAS and hipBLASLt. template <> inline std::string ComputeTypeFor() { - if (at::globalContext().float32Precision("cuda", "matmul") != "tf32") { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) { return "f32_r"; } else { return "xf32_r"; @@ -244,33 +246,25 @@ inline std::string to_string_epilogue(const at::cuda::blas::GEMMAndBiasActivatio namespace detail { -static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size) { +static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) { + + if (!config.enabled) { + return true; // skip when disabled + } + auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); - // comparison done as 1D tensor at::Tensor ref = at::from_blob(c, {size}, options); at::Tensor oth = at::from_blob(other_c, {size}, options); at::Tensor ref_float = ref.to(at::kFloat); at::Tensor oth_float = oth.to(at::kFloat); - std::vector atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; - std::vector rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; - double last_succeed_atol = 1; - double last_succeed_rtol = 1; - for (auto& atol : atols) { - for (auto& rtol : rtols) { - if (at::allclose(ref_float, oth_float, rtol, atol)) { - last_succeed_atol = atol; - last_succeed_rtol = rtol; - } - } - } - if (last_succeed_atol == 1) { - return false; - } - else { - TUNABLE_LOG3("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol); - } - return true; + const bool ok = at::allclose(ref_float, oth_float, config.rtol, config.atol); + if (ok) { + TUNABLE_LOG3("├──verify numerics: PASSED with atol=", config.atol, ", rtol=", config.rtol); + } else { + TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol); + } + return ok; } } @@ -355,8 +349,10 @@ struct GemmParams : OpParams { } TuningStatus NumericalCheck(GemmParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); auto 
c_dtype = c10::CppTypeToScalarType::value; - return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; } char transa{}; @@ -449,8 +445,10 @@ struct GemmAndBiasParams : OpParams { } TuningStatus NumericalCheck(GemmAndBiasParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; - return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; } char transa{}; @@ -546,8 +544,10 @@ struct GemmStridedBatchedParams : OpParams { } TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; - return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; } char transa{}; @@ -663,7 +663,9 @@ struct ScaledGemmParams : OpParams { } TuningStatus NumericalCheck(ScaledGemmParams *other) { - return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; } char transa{}; diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index 1a0d96899906..29affa2d21ff 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -506,7 +506,7 @@ class HipblasltGemmOp : public Callable { } hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; } HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F); diff --git a/aten/src/ATen/cuda/tunable/GemmRocblas.h b/aten/src/ATen/cuda/tunable/GemmRocblas.h index d7c45dc91c21..60eaa2e4d475 100644 --- a/aten/src/ATen/cuda/tunable/GemmRocblas.h +++ b/aten/src/ATen/cuda/tunable/GemmRocblas.h @@ -141,7 +141,7 @@ class RocblasGemmOp : public Callable> { TuningStatus Call(const GemmParams* params) override { auto input_output_type = RocBlasDataTypeFor(); - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r) + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) return FAIL; // no support for TF32 in rocBLAS auto compute_type = RocBlasComputeTypeFor(); auto h_a = DoCastForHalfOrBfloat16(params->alpha); @@ -209,7 +209,7 @@ class RocblasGemmStridedBatchedOp : public Callable> TuningStatus Call(const GemmStridedBatchedParams* params) override { auto input_output_type = RocBlasDataTypeFor(); - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r) + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) return FAIL; // no support for TF32 in rocBLAS 
auto compute_type = RocBlasComputeTypeFor(); auto h_a = DoCastForHalfOrBfloat16(params->alpha); diff --git a/aten/src/ATen/cuda/tunable/README.md b/aten/src/ATen/cuda/tunable/README.md index b30040b7e284..db31af9259a5 100644 --- a/aten/src/ATen/cuda/tunable/README.md +++ b/aten/src/ATen/cuda/tunable/README.md @@ -145,7 +145,7 @@ programmatically since the settings become fixed. Use the C++ or Python APIs ins | PYTORCH_TUNABLEOP_VERBOSE | Default is 0. Set to 1 to enable basic logging. 2 for basic tuning status. 3 for full trace. | | PYTORCH_TUNABLEOP_VERBOSE_FILENAME | Default is "err" for stderr. Set to "out" for stdout or a filename for capturing verbose logging. | | PYTORCH_TUNABLEOP_FILENAME | Default is 'tunableop_results.csv'. | -| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is 0. Set to 1 to enable. | +| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is off. Set 'atol_rtol' to enable, for example "1e-5_1e-5". | | PYTORCH_TUNABLEOP_ROCBLAS_ENABLED | Default is 1. Set to 0 to disable rocblas being considered during tuning. | | PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED | Default is 1. Set to 0 to disable hipblaslt being considered during tuning. | | PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS | Default is 30. Unit is milliseconds. | @@ -173,10 +173,9 @@ All python APIs exist in the `torch.cuda.tunable` module. | get_max_tuning_iterations() -> int | | | set_filename(filename: str, insert_device_ordinal: bool = False) -> None | | | get_filename() -> str | | +| set_numerical_check_tolerances(enable: bool, atol: float, rtol: float) -> None | Enable or disable numerical checking; atol and rtol default to 1e-5. | get_results() -> Tuple[str, str, str, float] | | | get_validators() -> Tuple[str, str] | | -| write_file_on_exit(val: bool) -> None | Default is True. | -| write_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). | | read_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). | | tune_gemm_in_file(filename: str) -> None | read an untuned file and tune GEMMs in it. | | mgpu_tune_gemm_in_file(filename_pattern: str, num_gpus: int) -> None: -> None | read one or more untuned files and tune all unique GEMMs on one or more GPUs. 
| diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 3511e48ae061..c5ea0c6dd17c 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -107,14 +107,30 @@ void TuningResultsManager::AddImpl(const std::string& op_signature, } void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, ResultEntry best) { - std::scoped_lock l{lock_}; + bool is_new = false; + ResultEntry inserted = ResultEntry::Null(); - auto it = results_.find(op_signature); - if (it == results_.end()) { - it = results_.insert({op_signature, {}}).first; + // ---- mutate maps under results lock ---- + { + std::scoped_lock l{lock_}; + auto& km = results_[op_signature]; // creates if missing + is_new = (km.find(params_signature) == km.end()); + AddImpl(op_signature, params_signature, std::move(best), km); + if (is_new) { + inserted = km.at(params_signature); // snapshot for I/O after unlocking + } + } + if (!is_new) return; // only write once per unique (op, params) + + TuningContext* ctx = getTuningContext(); + if (ctx->IsTuningEnabled() && !ctx->IsRecordUntunedEnabled()) { + InitRealtimeAppend(ctx->GetFilename(), ctx->GetTuningResultsValidator().GetAllValidators()); + + if (is_new && realtime_out_ && realtime_out_->good()) { + AppendResultLine(op_signature, params_signature, inserted); + } } - AddImpl(op_signature, params_signature, std::move(best), it->second); } void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, @@ -150,6 +166,77 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std } } +void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const std::unordered_map& validators) { + std::scoped_lock fl{realtime_file_mutex_}; + + if (realtime_out_ && realtime_out_->good() && realtime_filename_ == filename) { + return; + } + + if (realtime_out_ && realtime_filename_ != filename) { + realtime_out_->flush(); + realtime_out_->close(); + realtime_out_.reset(); + validators_written_ = false; + } + + bool file_exists = false; + bool file_empty = true; + + { + std::ifstream check_file(filename); + if (check_file.good()) { + file_exists = true; + file_empty = (check_file.peek() == std::ifstream::traits_type::eof()); + } + } + + realtime_out_ = std::make_unique(filename, std::ios::out | std::ios::app); + + if (!realtime_out_->good()) { + TORCH_WARN("TunableOp realtime append: failed to open '", filename,"'"); + realtime_out_.reset(); + return; + } + + if(!file_exists || file_empty) { + for(const auto& [key, val] : validators) { + (*realtime_out_) << "Validator," << key << "," << val << std::endl; + realtime_out_->flush(); + } + validators_written_ = true; + + TUNABLE_LOG2("Wrote validators to realtime output file"); + } + + realtime_filename_ = filename; +} + +void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std::string& param_sig, const ResultEntry& result) { + std::scoped_lock fl{realtime_file_mutex_}; + + if(!realtime_out_ || !realtime_out_->good()) { + return; + } + + (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl; + realtime_out_->flush(); //ensure immediate write to disk + + TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result); +} + +void TuningResultsManager::CloseRealtimeAppend() { + std::scoped_lock fl{realtime_file_mutex_}; + + + if(realtime_out_) { + realtime_out_->flush(); + realtime_out_->close(); + 
realtime_out_.reset(); + TUNABLE_LOG2("Closed realtime output file"); + } +} + void TuningResultsManager::Delete(const std::string& op_signature, const std::string& params_signature) { std::scoped_lock l{lock_}; @@ -396,7 +483,6 @@ TuningContext::TuningContext() : tuning_enable_{true}, record_untuned_enable_{false}, manager_initialized_{false}, - write_file_on_exit_{true}, numerics_check_enable_{false}, max_tuning_duration_ms_{30}, max_tuning_iterations_{100}, @@ -404,8 +490,6 @@ TuningContext::TuningContext() : max_warmup_iterations_{0}, icache_flush_{true}, rotating_buffer_size_{-1}, - filename_{}, - untuned_file_{}, results_count_from_input_file_{0}, is_shutting_down_{false} { @@ -419,20 +503,8 @@ TuningContext::~TuningContext() { // but doesn't do any computation itself. return; } - auto filename = GetFilename(); - if (IsTunableOpEnabled() && IsTuningEnabled() && !filename.empty() && write_file_on_exit_) { - if (results_count_from_input_file_ < GetTuningResultsManager().GetSize()) { - if (results_count_from_input_file_ > 0) { - TUNABLE_LOG1("additional tuning results available, rewriting file ", filename); - } - else { - TUNABLE_LOG1("writing file ", filename); - } - if (!WriteFile(filename)) { - TUNABLE_LOG1("failed to write file ", filename); - } - } - } + TUNABLE_LOG1("Closing File"); + GetTuningResultsManager().CloseRealtimeAppend(); // Since, we do instant logging by default now. if (untuned_file_.good()) { untuned_file_.close(); @@ -513,20 +585,54 @@ std::ofstream& TuningContext::GetUntunedFile(){ return untuned_file_; } -void TuningContext::WriteFileOnExit(bool value) { - write_file_on_exit_ = value; -} void TuningContext::EnableNumericsCheck(bool value) { numerics_check_enable_ = value; } -bool TuningContext::IsNumericsCheckEnabled() const { - const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); - if (env == "1") { - return true; +NumericalCheckConfig TuningContext::GetNumericalCheckConfig() const { + const auto env_opt = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); + + if (!env_opt.has_value()) { + return numerics_cfg_; + } + + const std::string& env = env_opt.value(); + + if (env == "0") { + return NumericalCheckConfig(false, 1e-5, 1e-5); + } + + const size_t underscore = env.find('_'); + + TORCH_CHECK( + underscore != std::string::npos, + "Invalid PYTORCH_TUNABLEOP_NUMERICAL_CHECK format. " + "Expected 'atol_rtol', got: ", + env); + + double atol = 0.0; + double rtol = 0.0; + + try { + atol = std::stod(env.substr(0, underscore)); + rtol = std::stod(env.substr(underscore + 1)); + } catch (const std::exception& e) { + TORCH_CHECK(false, "Failed to parse PYTORCH_TUNABLEOP_NUMERICAL_CHECK: ", e.what()); } - return numerics_check_enable_; + + TORCH_CHECK( atol > 0.0 && rtol > 0.0, "Tolerance values must be positive. 
atol=", atol, ", rtol=", rtol); + return NumericalCheckConfig(true, atol, rtol); +} + +void TuningContext::SetNumericalCheckConfig(bool enabled, double atol, double rtol) { + TORCH_CHECK(atol > 0.0 && rtol > 0.0, "Numerical check tolerances must be positive"); + numerics_cfg_ = {enabled, atol, rtol}; +} + +bool TuningContext::IsNumericsCheckEnabled() const { + const auto cfg = GetNumericalCheckConfig(); + return cfg.enabled || numerics_check_enable_; } void TuningContext::SetMaxTuningDurationMs(int max_duration_ms) { @@ -636,11 +742,6 @@ TuningResultsManager& TuningContext::GetTuningResultsManager() { auto filename = GetFilename(); if (!filename.empty() && !IsRecordUntunedEnabled()) { ReadFile(filename); - // attempt immediately to open file for writing to catch errors early - std::ofstream file(filename, std::ios::out | std::ios::app); - if (!file.good()) { - TORCH_WARN("failed to open file '", filename, "' for writing; your tuning results will not be saved"); - } } }); return manager_; @@ -746,27 +847,6 @@ bool TuningContext::ReadFile(const std::string& filename_) { return true; } -bool TuningContext::WriteFile(const std::string& filename_) { - std::string filename = filename_.empty() ? GetFilename() : filename_; - std::ofstream file(filename, std::ios::out | std::ios::trunc); - if (!file.good()) { - TUNABLE_LOG1("error opening tuning results file for writing ", filename); - return false; - } - auto validators = GetTuningResultsValidator().GetAllValidators(); - for (const auto& [key, val] : validators) { - file << "Validator," << key << "," << val << std::endl; - } - auto results = GetTuningResultsManager().Dump(); - for (const auto& [op_sig, kernelmap] : results) { - for (const auto& [param_sig, result] : kernelmap) { - file << op_sig << "," << param_sig << "," << result << std::endl; - } - } - file.close(); - return true; -} - namespace { struct MaybeDelete { diff --git a/aten/src/ATen/cuda/tunable/Tunable.h b/aten/src/ATen/cuda/tunable/Tunable.h index 5e885d4764d2..17b4ea34ddf6 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.h +++ b/aten/src/ATen/cuda/tunable/Tunable.h @@ -103,10 +103,24 @@ class TORCH_CUDA_CPP_API TuningResultsManager { void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, const std::string& params_signature, const std::string& blas_signature); + + void InitRealtimeAppend( + const std::string& filename, + const std::unordered_map& validators); + + void AppendResultLine(const std::string& op_sig, + const std::string& param_sig, + const ResultEntry& result); + + void CloseRealtimeAppend(); // For clean shutdown private: std::mutex lock_; + std::mutex realtime_file_mutex_; + std::unique_ptr realtime_out_; + std::string realtime_filename_; ResultsMap results_; UntunedMap untuned_results_; + bool validators_written_ = false; }; @@ -134,6 +148,16 @@ class TORCH_CUDA_CPP_API TuningResultsValidator { GetValidateFuncs validators_; }; +struct NumericalCheckConfig { + bool enabled{false}; + double atol{1e-5}; + double rtol{1e-5}; + + NumericalCheckConfig() = default; + NumericalCheckConfig(bool e, double a, double r) : enabled(e), atol(a), rtol(r) {} +}; + + class TORCH_CUDA_CPP_API TuningContext { public: TuningContext(); @@ -155,6 +179,8 @@ class TORCH_CUDA_CPP_API TuningContext { void EnableNumericsCheck(bool value); bool IsNumericsCheckEnabled() const; + void SetNumericalCheckConfig(bool enabled, double atol, double rtol); + NumericalCheckConfig GetNumericalCheckConfig() const; void SetMaxTuningDurationMs(int max_duration_ms); int 
GetMaxTuningDurationMs() const; @@ -185,10 +211,7 @@ class TORCH_CUDA_CPP_API TuningContext { void SetFilename(const std::string& filename, bool insert_device_ordinal=false); std::string GetFilename() const; - void WriteFileOnExit(bool value); - bool ReadFile(const std::string& filename={}); - bool WriteFile(const std::string& filename={}); template void Log(int level, Types... args) { @@ -207,7 +230,6 @@ class TORCH_CUDA_CPP_API TuningContext { bool tuning_enable_; bool record_untuned_enable_; bool manager_initialized_; - bool write_file_on_exit_; bool numerics_check_enable_; int max_tuning_duration_ms_; int max_tuning_iterations_; @@ -222,6 +244,8 @@ class TORCH_CUDA_CPP_API TuningContext { std::ofstream untuned_file_; size_t results_count_from_input_file_; bool is_shutting_down_; + + NumericalCheckConfig numerics_cfg_{}; }; TORCH_CUDA_CPP_API TuningContext* getTuningContext(); diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h index d941c230630c..c014d1ea569c 100644 --- a/aten/src/ATen/cuda/tunable/TunableGemm.h +++ b/aten/src/ATen/cuda/tunable/TunableGemm.h @@ -109,7 +109,8 @@ class DefaultScaledGemmOp : public Callable> { params->c_scale_ptr, params->ldc, params->c_dtype, - params->use_fast_accum); + params->use_fast_accum, + std::nullopt /* alpha */); return OK; } }; diff --git a/aten/src/ATen/cuda/tunable/TunableOp.h b/aten/src/ATen/cuda/tunable/TunableOp.h index 6ca9e213e148..d7bf0e6d93d8 100644 --- a/aten/src/ATen/cuda/tunable/TunableOp.h +++ b/aten/src/ATen/cuda/tunable/TunableOp.h @@ -29,7 +29,7 @@ template class Callable { public: virtual ~Callable() = default; - virtual TuningStatus Call(const ParamsT*) { + virtual TuningStatus Call(const ParamsT* /*unused*/) { return FAIL; } virtual TuningStatus IsSupported(const ParamsT* params) { @@ -267,27 +267,10 @@ class TunableOp { for (size_t i = 0; i < op_names_.size(); i++) { auto* candidate = ops_[op_names_[i]].get(); // borrow pointer - if (do_numerics_check) { - ParamsT* numerical_params = params->DeepCopy(false); - auto status = candidate->Call(numerical_params); - if (status != OK) { - numerical_params->Delete(); - TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); - continue; - } - status = reference_params->NumericalCheck(numerical_params); - numerical_params->Delete(); - if (status != OK) { - TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); - continue; - } - } - else { - auto status = candidate->Call(reusable_params[0]); - if (status != OK) { - TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); - continue; - } + auto status = candidate->Call(reusable_params[0]); + if (status != OK) { + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; } // collect a small profile @@ -310,6 +293,22 @@ class TunableOp { continue; } + if (do_numerics_check) { + ParamsT* numerical_params = params->DeepCopy(false); + auto status = candidate->Call(numerical_params); + if (status != OK) { + numerical_params->Delete(); + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + status = reference_params->NumericalCheck(numerical_params); + numerical_params->Delete(); + if (status != OK) { + TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + } + // for warmup does user set max duration, max iters, or both? 
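With the numerics check now driven by a tolerance config and results appended to the file as they are found, a usage sketch follows (assuming the set_numerical_check_tolerances binding from the API table above is available in this build):

```python
import os
# Environment form documented above: "atol_rtol".
os.environ["PYTORCH_TUNABLEOP_ENABLE"] = "1"
os.environ["PYTORCH_TUNABLEOP_NUMERICAL_CHECK"] = "1e-4_1e-4"

import torch
torch.cuda.tunable.enable(True)
torch.cuda.tunable.set_numerical_check_tolerances(True, 1e-4, 1e-4)  # programmatic form from the API table

a = torch.randn(512, 512, device="cuda", dtype=torch.half)
b = torch.randn(512, 512, device="cuda", dtype=torch.half)
c = a @ b   # candidates are profiled first, then numerics-checked at these tolerances
```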
// warmup is skipped by default, i.e. warmup_iter = 0 // warmup will be set to the non-zero value of max_warmup_duration diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index 2fc1867d276d..dbd178e0f8ee 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -141,7 +141,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo size[i] = (int) t.size(i); } for (const auto i : c10::irange(dim, pad)) { - size[i] = (int) 1; + size[i] = 1; } dim = std::max(dim, pad); cudnnTensorFormat_t filter_format{}; diff --git a/aten/src/ATen/cudnn/Types.cpp b/aten/src/ATen/cudnn/Types.cpp index f6e080c433d6..f612436f5672 100644 --- a/aten/src/ATen/cudnn/Types.cpp +++ b/aten/src/ATen/cudnn/Types.cpp @@ -2,6 +2,8 @@ #include +#include + namespace at::native { cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { @@ -20,9 +22,10 @@ cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { } else if (dtype == at::kByte) { return CUDNN_DATA_UINT8; } - std::string msg("getCudnnDataTypeFromScalarType() not supported for "); - msg += toString(dtype); - throw std::runtime_error(msg); + TORCH_CHECK(false, + "getCudnnDataTypeFromScalarType() not supported for ", + toString(dtype) + ); } cudnnDataType_t getCudnnDataType(const at::Tensor& tensor) { diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 00573e3cf701..f1f205691747 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -166,6 +166,10 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } + virtual bool supportsBFloat16RNNWithCuDNN() const { + return false; + } + virtual long versionCuDNN() const { TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); } diff --git a/aten/src/ATen/detail/HPUHooksInterface.h b/aten/src/ATen/detail/HPUHooksInterface.h index 8cf9502a7e1b..3240ff4dac13 100644 --- a/aten/src/ATen/detail/HPUHooksInterface.h +++ b/aten/src/ATen/detail/HPUHooksInterface.h @@ -25,7 +25,7 @@ struct TORCH_API HPUHooksInterface : AcceleratorHooksInterface { false, "Cannot get device of pointer on HPU without HPU backend"); } - bool isPinnedPtr(const void*) const override { + bool isPinnedPtr(const void* /*data*/) const override { return false; } diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h index 70fbf3135a3c..ee23a0320f7c 100644 --- a/aten/src/ATen/functorch/BatchRulesHelper.h +++ b/aten/src/ATen/functorch/BatchRulesHelper.h @@ -410,7 +410,7 @@ struct ExistingBdimBatchRuleHelper -Tensor& unary_inplace_batch_rule(Tensor& self, std::optional, ExtraArgs... extra_args) { +Tensor& unary_inplace_batch_rule(Tensor& self, std::optional /*unused*/, ExtraArgs... extra_args) { INVOKE(self, Method)(std::forward(extra_args)...); return self; } diff --git a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp index 4f74468af085..cab76b3af9ad 100644 --- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp +++ b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp @@ -39,7 +39,7 @@ Tensor vdot_decomp(const Tensor& A, const Tensor& B) { // NB: I wrote this like this because we *might* want its for a future matmul // batch rule that isn't decomposed... 
// "tv" = tensor @ vector -static std::tuple> tv_batch_rule( +std::tuple> tv_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { if (self_bdim && other_bdim) { @@ -66,7 +66,7 @@ static std::tuple> tv_batch_rule( TORCH_INTERNAL_ASSERT(false, "can't get here"); } -static std::tuple> mv_batch_rule( +std::tuple> mv_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -79,7 +79,7 @@ static std::tuple> mv_batch_rule( return tv_batch_rule(self, self_bdim, other, other_bdim); } -static std::tuple> mm_batch_rule( +std::tuple> mm_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -94,7 +94,7 @@ static std::tuple> mm_batch_rule( return std::make_tuple( at::matmul(self_, other_), 0 ); } -static std::tuple> bmm_batch_rule( +std::tuple> bmm_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -176,7 +176,7 @@ struct LinalgCheckMatrixUnaryRuleHelper; template struct LinalgCheckMatrixUnaryRuleHelper> { - static inline Tensor check_and_reshape_input(const Tensor& tensor, std::optional batch_dim) { + static Tensor check_and_reshape_input(const Tensor& tensor, std::optional batch_dim) { TORCH_CHECK(rankWithoutBatchDim(tensor, batch_dim) >= 2, op_name, ": The input tensor A must have at least 2 dimensions."); return moveBatchDimToFront(tensor, batch_dim); } @@ -222,7 +222,7 @@ struct LinalgCheckMatrixBinaryRuleHelper; template struct LinalgCheckMatrixBinaryRuleHelper> { - static inline std::tuple check_inputs_and_reshape_inputs( + static std::tuple check_inputs_and_reshape_inputs( const Tensor& first, std::optional first_bdim, const Tensor& second, std::optional second_bdim) { TORCH_CHECK(rankWithoutBatchDim(first, first_bdim) >= 2, @@ -250,7 +250,7 @@ struct LinalgCheckMatrixBinaryRuleHelper> } }; -static void expect_at_least_rank( +void expect_at_least_rank( const Tensor& tensor, std::optional tensor_bdim, int64_t expected_rank, @@ -472,7 +472,7 @@ atol_rtol_tensor_batch_rule( return std::make_tuple(Func(input_, atol_, rtol_, hermitian), 0); } -static std::tuple> +std::tuple> pinv_batch_rule( const Tensor& input, std::optional input_bdim, const std::optional& atol, const std::optional atol_bdim, const std::optional& rtol, diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index 6e63708a90f4..5fba8d257ceb 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -213,40 +213,22 @@ static cudnn_grid_sample_backward_batch_rule( return grid_sample_backward_helper_out(std::move(bw_out), 0, 0, bdim_size); } -// TODO: replace with targetable functionalization +// uses functional formulation for one_hot under vmap to be compatible with +// fakeTensor/dynamic shapes and compiled functorch transforms. +// mirrors the meta path in aten/src/ATen/native/Onehot.cpp, +// but requires explicit positive num_classes under vmap to avoid +// data-dependent output shapes. 
static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes) { TORCH_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); - auto shape = self.sym_sizes().vec(); - - // empty tensor could be converted to one hot representation, - // but shape inference is not possible. - if (self.sym_numel() == 0) { - if (num_classes <= 0) { - TORCH_CHECK(false, "Can not infer total number of classes from empty tensor."); - } else { - shape.emplace_back(num_classes); - return at::empty_symint(shape, self.options()); - } - } + // disallow implicit inference under vmap; this would be data-dependent + // and is intentionally guarded by Dynamo in torch/_dynamo/variables/torch.py. TORCH_CHECK(num_classes > 0, "When vmap-ing torch.nn.functional.one_hot, please " "provide an explicit positive num_classes argument."); - // Disabling all of the following checks. This is OK because scatter has checks too. - // Maybe one_hot should be a primitive wrt autograd so we don't have to deal with this. - // // non-empty tensor - // if (self.device().type() != at::kCUDA) { - // //for cuda, rely on device assert thrown by scatter - // TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); - // } - // if (self.device().type() != at::kCUDA) { - // //rely on device asserts from scatter to avoid sync here - // TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); - // } - - shape.emplace_back(num_classes); - Tensor ret = at::zeros_symint(shape, self.options()); - return ret.scatter(-1, self.unsqueeze(-1), 1); + const auto options = self.options(); + at::Tensor index = at::arange(num_classes, options); + return at::eq(self.unsqueeze(-1), index).to(at::kLong); } template diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index 14f03bd17f4d..f5c770371de8 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -12,13 +12,14 @@ #include #include #include +#include // NOLINTBEGIN(bugprone-unchecked-optional-access) namespace at::functorch { namespace { -static bool any_has_value(ArrayRef> bdims) { +bool any_has_value(ArrayRef> bdims) { for (const auto& bdim : bdims) { if (bdim.has_value()) { return true; @@ -27,7 +28,7 @@ static bool any_has_value(ArrayRef> bdims) { return false; } -static int64_t get_num_leading_nones(ArrayRef> indices) { +int64_t get_num_leading_nones(ArrayRef> indices) { int64_t result = 0; for (const auto& idx : indices) { if (!idx.has_value() || !idx->defined()) { @@ -39,7 +40,7 @@ static int64_t get_num_leading_nones(ArrayRef> indices) { return result; } -static int64_t get_max_index_logical_dim( +int64_t get_max_index_logical_dim( ArrayRef> indices, ArrayRef> indices_bdims) { int64_t max_logical_dim = -1; @@ -56,7 +57,7 @@ static int64_t get_max_index_logical_dim( return max_logical_dim; } -static std::vector> batchIndices( +std::vector> batchIndices( at::TensorOptions options, ArrayRef> indices, ArrayRef> indices_bdims, @@ -94,9 +95,10 @@ static std::vector> batchIndices( if (index.has_value() && index->sym_numel() != 0) { const auto idx_bdim = indices_bdims[i]; indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank)); - if (index.value().dtype() == kBool && indices_bdims[i].has_value()) { - throw std::runtime_error("vmap: We do not support batching operators that can support dynamic shape. 
Attempting to batch over indexing with a boolean mask."); - } + TORCH_CHECK( + !(index.value().dtype() == kBool) || !indices_bdims[i].has_value(), + "vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask." + ); } else { indices_.push_back(index); } @@ -124,7 +126,7 @@ static std::vector> batchIndices( // Define an "advanced index" to be a selection object that is // a non-trivial Tensor (i.e. it does not represent :). -static bool is_advanced_index(const std::optional& idx) { +bool is_advanced_index(const std::optional& idx) { if (!idx.has_value()) { return false; } @@ -135,7 +137,7 @@ static bool is_advanced_index(const std::optional& idx) { } // See NOTE: [advanced indices adjacent] for definition -static bool are_advanced_indices_adjacent(ArrayRef> indices) { +bool are_advanced_indices_adjacent(ArrayRef> indices) { int64_t num_advanced_indices_regions = 0; bool in_advanced_indices_region = false; for (const auto& idx : indices) { @@ -163,7 +165,7 @@ static bool are_advanced_indices_adjacent(ArrayRef> indice // - result: Tensor[B, 4, 5, 6, 2, 3, 7, 8] // ------- ---- // region2 region1 -static Tensor swap_regions(const Tensor& tensor, int64_t first_region_size, int64_t second_region_size) { +Tensor swap_regions(const Tensor& tensor, int64_t first_region_size, int64_t second_region_size) { VmapDimVector permutation(tensor.dim(), 0); std::iota(permutation.begin(), permutation.end(), 0); std::rotate( @@ -551,7 +553,7 @@ Tensor &_index_put_impl__plumbing(Tensor &self, const List return self; } -static Tensor maybe_permute_values( +Tensor maybe_permute_values( const Tensor& values, ArrayRef> orig_indices, ArrayRef> orig_indices_bdims) { @@ -1050,7 +1052,7 @@ std::tuple> index_add_batch_rule( other, other_bdim, alpha, false); } -static std::tuple binary_pointwise_align( +std::tuple binary_pointwise_align( const Tensor & self, std::optional self_bdim, const Tensor & mask, diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index cd1d0e1487fb..08db1d202b4e 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -346,7 +346,7 @@ std::tuple> slice_batch_rule( return std::make_tuple(std::move(result), 0); } -static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; } diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 3eccc94d3ea6..985b289b3fe0 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -160,6 +160,10 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::CUDA, DispatchKey::CPU, DispatchKey::PrivateUse1, + DispatchKey::SparseCPU, + DispatchKey::SparseCUDA, + DispatchKey::SparseCsrCPU, + DispatchKey::SparseCsrCUDA, }); inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 4ec902b668e4..69af08a7bd7c 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s // used for functions that have aliasing operations but should be treated like they're out of place (i.e. 
lift_fresh) static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - return dynamicLayerBack(op, stack, true); + dynamicLayerBack(op, stack, true); } static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - return dynamicLayerBack(op, stack, false); + dynamicLayerBack(op, stack, false); } TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) { diff --git a/aten/src/ATen/functorch/Interpreter.h b/aten/src/ATen/functorch/Interpreter.h index 1c76230fb455..2a0e40199449 100644 --- a/aten/src/ATen/functorch/Interpreter.h +++ b/aten/src/ATen/functorch/Interpreter.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -106,9 +107,10 @@ struct VmapInterpreterMeta { template friend void to_json(T& json_j, const VmapInterpreterMeta& json_t) { - if (json_t.batchSize_.is_heap_allocated()) { - throw std::runtime_error("Serialization for heap-allocated SymInt is not implemented yet"); - } + TORCH_CHECK( + !json_t.batchSize_.is_heap_allocated(), + "Serialization for heap-allocated SymInt is not implemented yet" + ); json_j["batchSize"] = json_t.batchSize_.as_int_unchecked(); json_j["randomness"] = static_cast(json_t.randomness_); } @@ -302,7 +304,7 @@ struct Interpreter { } else if (meta.contains("Functionalize")) { json_t.meta_.emplace(meta["Functionalize"].template get()); } else { - throw std::runtime_error("unknown interpreter metadata type"); + TORCH_CHECK(false, "unknown interpreter metadata type"); } } diff --git a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp index 69517407e682..22a15c168445 100644 --- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp @@ -68,18 +68,18 @@ namespace at::functorch { namespace{ // PyTorch allows operations to specify dim 0 and dim -1 on a scalar tensor. -static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; } -static int64_t get_current_level() { +int64_t get_current_level() { auto maybe_level = maybeCurrentDynamicLayer(); TORCH_INTERNAL_ASSERT(maybe_level.has_value()); return maybe_level->layerId(); } // This check should probably go into the dispatcher... -static bool participatesInCurrentLevel(const Tensor& self) { +bool participatesInCurrentLevel(const Tensor& self) { auto current_level = get_current_level(); auto* maybe_batched_impl = maybeGetBatchedImpl(self); if (!maybe_batched_impl) { @@ -90,7 +90,7 @@ static bool participatesInCurrentLevel(const Tensor& self) { return self_level == current_level; } -static bool participatesInCurrentLevel(ITensorListRef self) { +bool participatesInCurrentLevel(ITensorListRef self) { for (const Tensor& tensor : self) { if (participatesInCurrentLevel(tensor)) { return true; @@ -285,7 +285,7 @@ std::vector unbind_batching_rule(const Tensor& self, int64_t dim) { // given (sizes, strides, storage_offset) returns the maximum location that // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). 
-static std::optional maximum_indexable_location( +std::optional maximum_indexable_location( c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, const c10::SymInt& storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { @@ -298,7 +298,7 @@ static std::optional maximum_indexable_location( // This checks that the range of possible memory locations accessible by // x.as_strided(sizes, strides, maybe_storage_offset) // are within the bounds of possible memory locations accessible by x. -static void checkBasicAsStridedValidForSlice( +void checkBasicAsStridedValidForSlice( const Tensor& physical_tensor, int64_t num_batch_dims, c10::SymIntArrayRef sizes, diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index ecedc729ccd7..667e92970033 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ Tensor linear_hack(const Tensor& input, const Tensor& weight, const std::optiona return output; } -static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { if (reduction == at::Reduction::Mean) { return unreduced.mean(); } else if (reduction == at::Reduction::Sum) { @@ -108,9 +109,7 @@ Tensor binary_cross_entropy_with_logits_hack( } Tensor trace_backward_decomp(const Tensor& grad, IntArrayRef sizes) { - if (sizes.size() != 2) { - throw std::runtime_error("expected matrix input"); - } + TORCH_CHECK(sizes.size() == 2, "expected matrix input"); auto grad_input = at::zeros(sizes[0] * sizes[1], grad.options()); auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong)); // Workaround using index_put instead of yet unsupported index_fill_ @@ -128,7 +127,7 @@ namespace { template using Ctype = std::conditional_t; -static Tensor make_feature_noise(const Tensor& input) { +Tensor make_feature_noise(const Tensor& input) { auto input_sizes = input.sizes(); TORCH_CHECK(input.dim() >= 2, "Feature dropout requires at least 2 dimensions in the input"); std::vector sizes; @@ -142,7 +141,7 @@ static Tensor make_feature_noise(const Tensor& input) { return at::empty(sizes, input.options()); } -static bool is_fused_kernel_acceptable(const Tensor& input, double p) { +bool is_fused_kernel_acceptable(const Tensor& input, double p) { return (input.is_cuda() || input.is_xpu() || input.is_lazy() || input.is_privateuseone()) && p > 0 && p < 1 && input.numel() > 0; } @@ -211,7 +210,7 @@ ALIAS_SPECIALIZATION(_feature_dropout, true, false) ALIAS_SPECIALIZATION(_alpha_dropout, false, true ) ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) -static Tensor dropout(const Tensor& input, double p, bool train) { +Tensor dropout(const Tensor& input, double p, bool train) { auto result = [&]() { NoNamesGuard guard; if (train && is_fused_kernel_acceptable(input, p)) { diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index f4316def4fb4..cfdecaac778b 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -90,6 +90,10 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->setMemoryFraction(fraction, device); } + std::vector 
getExpandableSegmentSizes(c10::DeviceIndex device) override { + return allocator_->getExpandableSegmentSizes(device); + } + void enable(bool value) override { allocator_->enable(value); } diff --git a/aten/src/ATen/metal/Context.h b/aten/src/ATen/metal/Context.h index 1f977cf50d9e..e4c6da738e0d 100644 --- a/aten/src/ATen/metal/Context.h +++ b/aten/src/ATen/metal/Context.h @@ -18,7 +18,7 @@ extern std::atomic g_metal_impl_registry; class MetalImplRegistrar { public: - explicit MetalImplRegistrar(MetalInterface*); + explicit MetalImplRegistrar(MetalInterface* /*impl*/); }; at::Tensor& metal_copy_(at::Tensor& self, const at::Tensor& src); diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index d858df073397..6c58de099648 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -12,7 +12,7 @@ #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled" #define MPS_ERROR_RUNTIME_TOO_LOW \ - "The MPS backend is supported on MacOS 13.0+.", \ + "The MPS backend is supported on MacOS 14.0+. ", \ "Current OS version can be queried using `sw_vers`" #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \ "as the MPS framework doesn't support float64. Please use float32 instead." diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index a2ec221c1bfe..34fbd31af91d 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -70,7 +70,10 @@ } void* MPSHooks::getCommandBuffer() const { - return at::mps::getDefaultMPSStream()->commandBuffer(); + auto stream = at::mps::getDefaultMPSStream(); + // Release the pending computeCommandEncoder, as an extension is likely to allocate a new one + stream->endKernelCoalescing(); + return stream->commandBuffer(); } void* MPSHooks::getDispatchQueue() const { diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm index e9627a343ad6..71325bd69e1d 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -158,7 +158,18 @@ @interface MPSGraphExecutionDescriptor () endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; - [blitEncoder fillBuffer:buffer range:NSMakeRange(offset, length) value:value]; + // For some reason fillBuffer stopped working for length > 4GB on macOS 26 + // See https://github.com/pytorch/pytorch/issues/163962 + // Work around this by batching fill commands into 4GB chunks + constexpr size_t max_copy_size = 0x100000000; // 4GB + size_t bytes_filled = 0; + size_t bytes_remains = length; + while (bytes_remains > 0) { + NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains); + [blitEncoder fillBuffer:buffer range:NSMakeRange(offset + bytes_filled, bytes_to_copy) value:value]; + bytes_filled += bytes_to_copy; + bytes_remains -= bytes_to_copy; + } [blitEncoder endEncoding]; synchronize(syncType); } diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index db11422f2d83..c164120a1f3c 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -240,8 +240,8 @@ TORCH_META_FUNC(gelu_backward) ( namespace at::native { -static const double SELU_ALPHA = 1.6732632423543772848170429916717; -static const double SELU_SCALE = 1.0507009873554804934193349852946; +static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717; +static constexpr double SELU_SCALE = 1.0507009873554804934193349852946; DEFINE_DISPATCH(elu_stub); DEFINE_DISPATCH(elu_backward_stub); @@ -670,6
+670,8 @@ Tensor rrelu_with_noise_backward( } Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional generator) { + TORCH_CHECK(std::isfinite(lower.to()), "rrelu: lower bound must be finite, got ", lower.to()); + TORCH_CHECK(std::isfinite(upper.to()), "rrelu: upper bound must be finite, got ", upper.to()); TORCH_CHECK(lower.to() <= upper.to(), "Lower bound should be less than or equal to the upper bound") auto noise = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::rrelu_with_noise(self, noise, lower, upper, training, std::move(generator)); diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index e744c2b5e0e7..5821cd561cdf 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -24,7 +24,7 @@ namespace at::native { namespace { template -static void adaptive_avg_pool3d_out_frame( +void adaptive_avg_pool3d_out_frame( const scalar_t* input_p, scalar_t* output_p, int64_t sizeD, @@ -176,7 +176,7 @@ void adaptive_avg_pool3d_out_cpu_template( } template -static void adaptive_avg_pool3d_backward_out_frame( +void adaptive_avg_pool3d_backward_out_frame( scalar_t* gradInput_p, const scalar_t* gradOutput_p, int64_t sizeD, diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index 46dc5623b595..ef4bab3ec1de 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -93,7 +93,7 @@ namespace { // 5d tensor B x D x T x H x W template -static void adaptive_max_pool3d_single_out_frame( +void adaptive_max_pool3d_single_out_frame( const scalar_t *input_p, scalar_t *output_p, int64_t *ind_p, @@ -170,7 +170,7 @@ static void adaptive_max_pool3d_single_out_frame( } template -static void adaptive_max_pool3d_out_frame( +void adaptive_max_pool3d_out_frame( const scalar_t *input_data, scalar_t *output_data, int64_t *indices_data, @@ -202,7 +202,7 @@ static void adaptive_max_pool3d_out_frame( } template -static void adaptive_max_pool3d_backward_single_out_frame( +void adaptive_max_pool3d_backward_single_out_frame( scalar_t *gradInput_p, const scalar_t *gradOutput_p, const int64_t *ind_p, @@ -241,7 +241,7 @@ static void adaptive_max_pool3d_backward_single_out_frame( } template -static void adaptive_max_pool3d_backward_out_frame( +void adaptive_max_pool3d_backward_out_frame( scalar_t *gradInput_data, const scalar_t *gradOutput_data, const int64_t *indices_data, diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index 8a588b7cac11..365cfa311512 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ b/aten/src/ATen/native/AveragePool3d.cpp @@ -153,7 +153,7 @@ namespace at::native { namespace { template -static void avg_pool3d_out_frame( +void avg_pool3d_out_frame( const scalar_t *input_p, scalar_t *output_p, int64_t nslices, @@ -333,7 +333,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( namespace { template -static void avg_pool3d_backward_out_frame( +void avg_pool3d_backward_out_frame( scalar_t *gradInput_p, const scalar_t *gradOutput_p, int64_t nslices, diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index d323e54a95ab..6669357cda45 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -2060,7 +2060,7 @@ std::tuple linalg_lu_factor(const Tensor& A, bool pivot) { } // TODO 
Deprecate this function in favour of linalg_lu_factor_ex -std::tuple _lu_with_info(const Tensor& self, bool compute_pivots, bool) { +std::tuple _lu_with_info(const Tensor& self, bool compute_pivots, bool /*unused*/) { TORCH_WARN_ONCE( "torch.lu is deprecated in favor of torch.linalg.lu_factor / torch.linalg.lu_factor_ex and will be ", "removed in a future PyTorch release.\n", diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 54fb610722d6..df64aa42e602 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -143,13 +143,13 @@ Tensor& cholesky_inverse_kernel_impl(Tensor& result, Tensor& infos, bool upper) For more info see https://github.com/pytorch/pytorch/issues/145801#issuecomment-2631781776 */ template -static inline +inline std::enable_if_t, int> lapack_work_to_int(const T val) { const auto next_after = std::nextafter(val, std::numeric_limits::infinity()); return std::max(1, std::ceil(next_after)); } template -static inline +inline std::enable_if_t::value, int> lapack_work_to_int(const T val) { return lapack_work_to_int(val.real()); } @@ -343,7 +343,7 @@ void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, c For further details, please see the LAPACK documentation for GEQRF. */ template -static void apply_geqrf(const Tensor& input, const Tensor& tau) { +void apply_geqrf(const Tensor& input, const Tensor& tau) { #if !AT_BUILD_WITH_LAPACK() TORCH_CHECK( false, @@ -1039,7 +1039,7 @@ void lu_solve_kernel(const Tensor& LU, const Tensor& pivots, const Tensor& B, Tr } template -static void apply_svd(const Tensor& A, +void apply_svd(const Tensor& A, const bool full_matrices, const bool compute_uv, const Tensor& U, diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 49366151ae60..6b7496f49732 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -58,7 +58,7 @@ scalar_t dot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, template scalar_t vdot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, int64_t incy); -static constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) { +static constexpr bool lda_cond(int64_t m, int64_t n, int64_t lda) { return n == 1 || lda >= std::max(1L, m); } diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index 5f3976bd18d6..b476ca3cff8f 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -286,7 +286,7 @@ template void scal_fast_path(int *n, scalar_t *a, scalar_t *x, int *in #if AT_BUILD_WITH_BLAS() template <> bool scal_use_fast_path(int64_t n, int64_t incx) { - auto intmax = std::numeric_limits::max(); + auto constexpr intmax = std::numeric_limits::max(); return n <= intmax && incx <= intmax; } @@ -315,7 +315,7 @@ bool gemv_use_fast_path( int64_t incx, [[maybe_unused]] float beta, int64_t incy) { - auto intmax = std::numeric_limits::max(); + auto constexpr intmax = std::numeric_limits::max(); return (m <= intmax) && (n <= intmax) && (lda <= intmax) && (incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax); } @@ -375,7 +375,7 @@ static void bf16_gemv_trans( const at::BFloat16 beta, at::BFloat16* y, const int incy) { - return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); + bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); } template <> diff --git 
a/aten/src/ATen/native/BucketizationUtils.h b/aten/src/ATen/native/BucketizationUtils.h index 70878ecd704d..bd19f9c987f1 100644 --- a/aten/src/ATen/native/BucketizationUtils.h +++ b/aten/src/ATen/native/BucketizationUtils.h @@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors( const Tensor& raw_boundaries) { Tensor trimmed_sorter; Tensor raw_sorter; - return searchsorted_maybe_trim_input_tensors( + searchsorted_maybe_trim_input_tensors( trimmed_input, trimmed_boundaries, trimmed_sorter, diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index 20be0d6fe017..c17a70ea308a 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -991,7 +991,7 @@ std::size_t UnsafeUkernelKeyHasher::operator()(const PackKey& key) cons template struct KernelCache { using kstore_t = std::unordered_map, UnsafeUkernelKeyHasher>; - static inline std::shared_ptr&& fetch_or_create( + static std::shared_ptr&& fetch_or_create( const key_t& key, const std::function()>& callback) { auto&& search = get_store().find(key); @@ -1003,7 +1003,7 @@ struct KernelCache { } } - static inline kstore_t& get_store() { + static kstore_t& get_store() { static thread_local kstore_t cache_kernels; return cache_kernels; } @@ -1067,7 +1067,7 @@ struct GemmHelper { struct Brgemm : public KernelCache { // Fetch/create GemmHelper object and execute brgemm with batch size = 1 template - static inline void call( + static void call( int64_t M, int64_t N, int64_t K, @@ -1118,12 +1118,12 @@ struct Brgemm : public KernelCache { .execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data()); } - static inline std::shared_ptr& get_current() { + static std::shared_ptr& get_current() { static thread_local std::shared_ptr current; return current; } - static inline bool device_check(ScalarType dtype) { + static bool device_check(ScalarType dtype) { if (!at::globalContext().userEnabledMkldnn()) { return false; } @@ -1153,7 +1153,7 @@ using pack_t = dnnl::ukernel::brgemm_pack_B; using pack_t = dnnl::ukernel::transform; #endif struct Pack : public KernelCache { - static inline void call( + static void call( int64_t K, int64_t N, int64_t ld_in, @@ -1182,7 +1182,7 @@ struct Pack : public KernelCache { } } - static inline bool could_pack(ScalarType dtype) { + static bool could_pack(ScalarType dtype) { if (!at::globalContext().userEnabledMkldnn()) { return false; } diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp index 51e005c2901b..f0270a02b267 100644 --- a/aten/src/ATen/native/Col2Im.cpp +++ b/aten/src/ATen/native/Col2Im.cpp @@ -71,7 +71,7 @@ namespace at::native { namespace { -static void col2im_out_cpu_template( +void col2im_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef output_size, diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index e160c84ced33..892144ac663a 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -465,8 +465,11 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor return false; } - auto fmt = input.suggest_memory_format(); - return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; + auto is_channel_last = [](const at::Tensor& t) { + auto fmt = t.suggest_memory_format(); + return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; + }; + return is_channel_last(input) || is_channel_last(weight); } } // namespace at::native diff --git 
a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index ab427f396e34..1158359be239 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -32,10 +32,6 @@ #include #endif -#ifdef USE_MPS -#include -#endif - #ifndef AT_PER_OPERATOR_HEADERS #include #include @@ -410,11 +406,23 @@ struct ConvParams { // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) #if !defined(C10_MOBILE) - if (!detail::getCUDAHooks().compiledWithCuDNN()) { + if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { return false; } + static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); + // broken on cuDNN 9.8 + if (cudnn_version >= 90800) { + if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous && + (input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) && + weight.dim() == 5) { + for (int i = 2; i < weight.dim(); i++) { + if (weight.size(i) != 1) { + return false; + } + } + } + } if (needs_64bit_indexing_no_split(input, weight)) { - static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." @@ -422,9 +430,6 @@ struct ConvParams { return false; } } - if (!input.is_cuda() || !cudnn_enabled) { - return false; - } if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { return false; @@ -443,16 +448,19 @@ struct ConvParams { // Use cudnn for FP16 depthwise convolutions bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { - if (!detail::getCUDAHooks().compiledWithCuDNN()) { + if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) { return false; } - if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) { - // always use cudnn_depthwise for channels_last format - return true; - } // native kernel doesn't support 64-bit non-splittable case - if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { + if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; + // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { + if (cudnn_version < 0 || cudnn_version > 91000) { + return false; + } + } + if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." 
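Aside: the gates above compare detail::getCUDAHooks().versionCuDNN() against integer thresholds (90300, 90800, 91000). Assuming cuDNN 9.x encodes its version as major*10000 + minor*100 + patchlevel — an assumption consistent with the "broken on cuDNN 9.8" comment next to the 90800 check, but not stated in the patch itself — a minimal standalone sketch of how those thresholds decode:

```cpp
// Illustrative sketch only (not part of the patch): decodes the integer
// cuDNN version used by the gates above, assuming the cuDNN 9.x scheme
// CUDNN_VERSION = major * 10000 + minor * 100 + patchlevel.
#include <cstdio>

struct CudnnVersion {
  long major;
  long minor;
  long patch;
};

static CudnnVersion decode_cudnn_version(long v) {
  return {v / 10000, (v / 100) % 100, v % 100};
}

int main() {
  for (long v : {90300L, 90800L, 91000L}) {
    const CudnnVersion d = decode_cudnn_version(v);
    std::printf("%ld -> %ld.%ld.%ld\n", v, d.major, d.minor, d.patch);
  }
  return 0;
}
```

Read this way, 90300 maps to 9.3.0, 90800 to 9.8.0, and 91000 to 9.10.0, so the `cudnn_version > 91000` guard in use_cudnn_depthwise excludes releases newer than 9.10, matching the "first broken in 9.11x" TODO.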
@@ -462,6 +470,10 @@ struct ConvParams { return true; } } + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { + // always use cudnn_depthwise for channels_last format + return true; + } if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { bool kernel_cond = (use_cudnn(input, weight) && input.scalar_type() == kHalf && // only for FP16 @@ -646,6 +658,7 @@ static void check_shape_forward(const at::Tensor& input, TORCH_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); TORCH_CHECK(!params.is_stride_nonpos(), "non-positive stride is not supported"); TORCH_CHECK(!params.is_dilation_neg(), "dilation should be greater than zero"); + TORCH_CHECK(groups > 0, "expected groups to be greater than 0, but got groups=", groups); TORCH_CHECK(weight_dim == k, "Expected ", weight_dim, "-dimensional input for ", weight_dim, @@ -690,7 +703,7 @@ static void check_shape_forward(const at::Tensor& input, // If kernel size is incorrect std::ostringstream input_ss; std::ostringstream kernel_ss; - std::string separator = ""; + std::string separator; for (int i = 0, len = input_shape.size(); i < len; ++i) { input_ss << separator << input_shape[i]; @@ -1007,7 +1020,7 @@ static Tensor convolution_same( if (symmetric_padding) { // All backends handle symmetric padding natively - SymDimVector output_padding(static_cast(dim)); + SymDimVector output_padding(dim); return at::convolution_symint(input, weight, bias, stride, padding_l, dilation, false, output_padding, groups); } @@ -1027,7 +1040,7 @@ static Tensor convolution_same( } } auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0); - SymDimVector output_padding(static_cast(dim)); + SymDimVector output_padding(dim); return at::convolution_symint(padded_input, weight, bias, stride, padding_l, dilation, false, output_padding, groups); } @@ -1162,7 +1175,7 @@ at::Tensor convolution( bool deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); return at::_convolution(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, - ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN("conv")); + ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN(at::Float32Op::CONV)); } at::Tensor convolution_overrideable( @@ -1307,7 +1320,7 @@ ConvBackend select_conv_backend( params.benchmark = ctx.benchmarkCuDNN(); params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); params.cudnn_enabled = ctx.userEnabledCuDNN(); - params.allow_tf32 = ctx.allowTF32CuDNN("conv"); + params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); auto input = input_r; auto weight = weight_r; @@ -1429,12 +1442,8 @@ static inline at::MemoryFormat determine_backend_memory_format( } break; case ConvBackend::Mps: + case ConvBackend::MpsTranspose: if (mps_conv_use_channels_last(input, weight)) { -#ifdef USE_MPS - if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) { - break; - } -#endif backend_memory_format = (k == 5) ? 
MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; } break; @@ -1691,7 +1700,7 @@ at::Tensor _convolution( c10::MaybeOwned bias_r_maybe_owned = at::borrow_from_optional_tensor(bias_r_opt); const Tensor& bias_r = *bias_r_maybe_owned; - return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN("conv")); + return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN(at::Float32Op::CONV)); } std::tuple convolution_backward_overrideable( @@ -1989,7 +1998,7 @@ std::tuple convolution_backward( params.benchmark = ctx.benchmarkCuDNN(); params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); params.cudnn_enabled = ctx.userEnabledCuDNN(); - params.allow_tf32 = ctx.allowTF32CuDNN("conv"); + params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); // Validate inputs. check_shape_backward(input, weight.sizes(), params); diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 619542c29ef5..538a893d54ea 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -25,7 +25,7 @@ namespace at::native { namespace { -static Tensor compute_columns2d( +Tensor compute_columns2d( const Tensor& input, IntArrayRef padding, IntArrayRef stride, @@ -93,7 +93,7 @@ static Tensor compute_columns2d( return columns.contiguous(); } -static inline void slow_conv2d_shape_check( +inline void slow_conv2d_shape_check( const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -205,7 +205,7 @@ static inline void slow_conv2d_shape_check( } } -static inline Tensor view_weight_2d(const Tensor& weight_, +inline Tensor view_weight_2d(const Tensor& weight_, at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) { Tensor weight = weight_.contiguous(memory_format); if (weight.dim() == 4) { @@ -220,7 +220,7 @@ static inline Tensor view_weight_2d(const Tensor& weight_, } template -static void slow_conv2d_update_output_frame( +void slow_conv2d_update_output_frame( TensorAccessor input, TensorAccessor output, TensorAccessor weight, @@ -480,7 +480,7 @@ void slow_conv2d_backward_weight_frame( } } -static void slow_conv2d_backward_weight_out_cpu_template( +void slow_conv2d_backward_weight_out_cpu_template( Tensor& grad_weight, const Tensor& input, const Tensor& grad_output_, diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index f361b3a81912..894bf29456f7 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -27,7 +28,7 @@ namespace at::native { namespace { -static Tensor compute_columns3d( +Tensor compute_columns3d( const Tensor& input_, IntArrayRef stride, IntArrayRef padding, @@ -107,7 +108,7 @@ static Tensor compute_columns3d( return columns; } -static inline void slow_conv3d_shape_check( +inline void slow_conv3d_shape_check( const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -174,6 +175,23 @@ static inline void slow_conv3d_shape_check( const int64_t input_height = input.size(dim_height); const int64_t input_width = input.size(dim_width); + constexpr int64_t MAX_SAFE_PAD = (1LL << 61); + + TORCH_CHECK_VALUE( + 
pad_height <= MAX_SAFE_PAD, + "Padding height too large: pad_height=", + pad_height); + + TORCH_CHECK_VALUE( + pad_width <= MAX_SAFE_PAD, + "Padding width too large: pad_width=", + pad_width); + + TORCH_CHECK_VALUE( + pad_depth <= MAX_SAFE_PAD, + "Padding depth too large: pad_depth=", + pad_depth); + const int64_t exact_input_depth = input_depth + 2 * pad_depth; const int64_t exact_input_height = input_height + 2 * pad_height; const int64_t exact_input_width = input_width + 2 * pad_width; @@ -221,6 +239,14 @@ static inline void slow_conv3d_shape_check( output_width, "). Output size is too small"); + uint64_t kernel_product; + TORCH_CHECK( + !c10::mul_overflows(kernel_height, kernel_width, &kernel_product), + "Kernel height x width product is too large: kernel_height=", + kernel_height, + ", kernel_width=", + kernel_width); + if (weight.defined()) { int64_t n_input_plane = weight.size(1); if (weight.dim() == 2) { @@ -247,7 +273,7 @@ static inline void slow_conv3d_shape_check( } } -static Tensor view_weight_2d(const Tensor& weight_) { +Tensor view_weight_2d(const Tensor& weight_) { Tensor weight = weight_.contiguous(); if (weight.dim() == 5) { const int64_t s1 = weight.size(0); @@ -260,7 +286,7 @@ static Tensor view_weight_2d(const Tensor& weight_) { } template -static void slow_conv3d_update_output_frame( +void slow_conv3d_update_output_frame( TensorAccessor input, TensorAccessor output, TensorAccessor weight, @@ -489,7 +515,7 @@ void slow_conv3d_backward_weight_frame( grad_weight.data(), ldc, grad_weight.stride(0) * n); } -static void slow_conv3d_backward_parameters_out_cpu_template( +void slow_conv3d_backward_parameters_out_cpu_template( Tensor& grad_weight, const Tensor& input, const Tensor& grad_output, diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 3d388194ea49..0b3ffda30577 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -1,6 +1,5 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include #include #include diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index 1c9db44aebb0..755fe00b1f1c 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -127,7 +128,7 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { - const static scalar_t kTailValues[] = { + constexpr static scalar_t kTailValues[] = { 0.0810614667953272, 0.0413406959554092, 0.0276779256849983, @@ -139,7 +140,7 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { 0.00925546218271273, 0.00833056343336287 }; - if (k <= 9) { + if (k < std::size(kTailValues)) { return kTailValues[static_cast(k)]; } scalar_t kp1sq = (k + 1) * (k + 1); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 150970edc507..e1076d0400f7 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -108,7 +108,7 @@ bool is_fast_path(const Tensor& src, const std::optional& scale, Tensor& // index_add (using add_indices as the index), without creating an intermediary // tensor to hold the selected embeddings template -static std::enable_if_t, void> +std::enable_if_t, void> index_select_add( const Tensor& select_indices, const Tensor& add_indices, @@ -494,7 +494,7 @@ index_select_add(const Tensor &select_indices, // mul (scaling by per_sample_weights) // index_add (using add_indices as the 
index) template -static std::enable_if_t, void> +std::enable_if_t, void> index_select_scale_add( const Tensor& select_indices, const Tensor& add_indices, diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index 5ff1e6b61ed2..8e04a7490e87 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -97,43 +97,38 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { int64_t nDims = self.dim(); TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); - int64_t height = self.size(0); - int64_t width = self.size(1); + auto height = self.sym_size(0); + auto width = self.sym_size(1); if (nDims > 2) { - int64_t dim1 = height; for (const auto i : c10::irange(1, nDims)) { - if (self.size(i) != dim1) { + if (self.sym_size(i) != height) { TORCH_CHECK(false, "all dimensions of input must be of equal length"); } } } - int64_t storage_offset = self.storage_offset(); - std::vector sizes; - std::vector strides; - int64_t size = std::min(height, width); + auto storage_offset = self.sym_storage_offset(); + auto size = std::min(height, width); int64_t stride = 0; for (const auto i : c10::irange(nDims)) { stride += self.stride(i); } - strides.push_back(stride); - sizes.push_back(size); + std::vector strides{stride}; + std::vector sizes{size}; - auto main_diag = self.as_strided(sizes, strides, storage_offset); + auto main_diag = self.as_strided_symint(sizes, strides, storage_offset); main_diag.fill_(fill_value); if (wrap && nDims == 2 && height > width + 1) { - std::vector wrap_sizes; + auto step = width + 1; + auto wrap_size = ((self.numel() + step - 1) / step) - size; + std::vector wrap_sizes{wrap_size}; - int64_t step = width + 1; - int64_t wrap_size = ((self.numel() + step - 1) / step) - size; - wrap_sizes.push_back(wrap_size); + auto offset = self.stride(0) * (width + 1); - int64_t offset = self.stride(0) * (width + 1); - - auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset); + auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); wrap_diag.fill_(fill_value); } diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index 059d27b39546..664a612d0b13 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -130,7 +130,7 @@ namespace native { namespace { template -static void fractional_max_pool2d_out_single_batch_frame( +void fractional_max_pool2d_out_single_batch_frame( const scalar_t* input, scalar_t* output, int64_t* indices, @@ -188,7 +188,7 @@ static void fractional_max_pool2d_out_single_batch_frame( } template -static void fractional_max_pool2d_out_frame( +void fractional_max_pool2d_out_frame( const scalar_t* input, scalar_t* output, int64_t* indices, @@ -220,7 +220,7 @@ static void fractional_max_pool2d_out_frame( } template -static void fractional_max_pool2d_backward_out_single_batch_frame( +void fractional_max_pool2d_backward_out_single_batch_frame( scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, @@ -247,7 +247,7 @@ static void fractional_max_pool2d_backward_out_single_batch_frame( } template -static void fractional_max_pool2d_backward_out_frame( +void fractional_max_pool2d_backward_out_frame( scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 68328018b24b..5ed3fdeab765 100644 --- 
a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -99,7 +99,7 @@ namespace at::native { namespace { template -static void fractional_max_pool3d_out_single_batch_frame( +void fractional_max_pool3d_out_single_batch_frame( const scalar_t* input, scalar_t* output, int64_t* indices, @@ -169,7 +169,7 @@ static void fractional_max_pool3d_out_single_batch_frame( } template -static void fractional_max_pool3d_out_frame( +void fractional_max_pool3d_out_frame( const scalar_t* input, scalar_t* output, int64_t* indices, @@ -257,7 +257,7 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)( namespace { template -static void fractional_max_pool3d_backward_out_single_batch_frame( +void fractional_max_pool3d_backward_out_single_batch_frame( scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, @@ -287,7 +287,7 @@ static void fractional_max_pool3d_backward_out_single_batch_frame( } template -static void fractional_max_pool3d_backward_out_frame( +void fractional_max_pool3d_backward_out_frame( scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h index f783043c7961..3388af7b8a0a 100644 --- a/aten/src/ATen/native/GridSamplerUtils.h +++ b/aten/src/ATen/native/GridSamplerUtils.h @@ -93,6 +93,12 @@ inline bool cond_cudnn_grid_sampler( const TensorBase& input, const TensorBase& grid ) { + auto st = input.scalar_type(); + if (!(st == kDouble || st == kFloat || st == kHalf)) + return false; + st = grid.scalar_type(); + if (!(st == kDouble || st == kFloat || st == kHalf)) + return false; return ( at::native::cudnn_is_acceptable(input) && at::native::cudnn_is_acceptable(grid) && diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp index 9954edef9460..5919997cf5fe 100644 --- a/aten/src/ATen/native/Histogram.cpp +++ b/aten/src/ATen/native/Histogram.cpp @@ -23,6 +23,7 @@ #include #endif +#include #include #include #include @@ -202,6 +203,46 @@ select_outer_bin_edges(const Tensor& input, std::optional> return std::make_pair(leftmost_edges, rightmost_edges); } + +/* Bin edges correction based on the precision representation. + * To maintain the backward compatibility we take max(std::nextafter<>, +1) + * and min(std::nextafter<>, -1) for scalar types. For other types +/- 1 as usual. + */ +void bins_edges_correction(const ScalarType& t, double &leftmost_edge, double &rightmost_edge) +{ +#define UPDATE_WITH_LIMIT(real_type, scalartype) \ + case ScalarType::scalartype: \ + leftmost_edge = std::min( \ + static_cast( \ + std::nexttoward( \ + static_cast(leftmost_edge), \ + std::numeric_limits::lowest() \ + ) \ + ), \ + leftmost_edge - 1. \ + ); \ + rightmost_edge = std::max( \ + static_cast( \ + std::nexttoward( \ + static_cast(rightmost_edge), \ + std::numeric_limits::max() \ + ) \ + ), \ + rightmost_edge + 1. \ + ); \ + break; + + switch (t) { + UPDATE_WITH_LIMIT(double, Double) + UPDATE_WITH_LIMIT(float, Float) + default: + // Fallback to the default behavior for other types + leftmost_edge -= 1; + rightmost_edge += 1; + } +#undef UPDATE_WITH_LIMIT +} + /* histc's version of the logic for outermost bin edges. 
*/ std::pair histc_select_outer_bin_edges(const Tensor& input, @@ -216,8 +257,7 @@ std::pair histc_select_outer_bin_edges(const Tensor& input, } if (leftmost_edge == rightmost_edge) { - leftmost_edge -= 1; - rightmost_edge += 1; + bins_edges_correction(input.dtype().toScalarType(), leftmost_edge, rightmost_edge); } TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) || diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index 25eb4d678724..acdcb2b27bda 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -19,7 +19,7 @@ namespace at::native { namespace { -static void im2col_out_cpu_template( +void im2col_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef kernel_size, diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 616e6ec60e13..7b5ec83e1698 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -2801,6 +2801,7 @@ Tensor matrix_exp(const Tensor& a) { // TODO This should be deprecated in favor of linalg_matrix_exp_differential // in FunctionsManual.cpp Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { + squareCheckInputs(self, "matrix_exp_backward"); NoTF32Guard disable_tf32; return backward_analytic_function_of_a_matrix( self, grad, diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 265bc112adcc..40d79d97c0cd 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -61,7 +61,7 @@ constexpr float EPSILON = 1e-12; namespace { - static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { + inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { if (reduction == at::Reduction::Mean) { return unreduced.mean(); } else if (reduction == at::Reduction::Sum) { diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 46b9397a008c..2e2bc5542b51 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -44,7 +44,7 @@ namespace { // this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done template -static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { +inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { if (idx % 2 == 0) { return BLANK; } else { diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index a3ec774a0a46..b524d277cd0a 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -58,7 +58,7 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( } template -static void multilabel_margin_loss_forward_out_frame( +void multilabel_margin_loss_forward_out_frame( const Tensor& input_contiguous, const Tensor& target_contiguous, Tensor& output, @@ -108,7 +108,7 @@ static void multilabel_margin_loss_forward_out_frame( } } -static void multilabel_margin_loss_forward_out_cpu_template( +void multilabel_margin_loss_forward_out_cpu_template( const Tensor& input, const Tensor& target, Tensor& output, @@ -153,7 +153,7 @@ static void multilabel_margin_loss_forward_out_cpu_template( } template -static void multilabel_margin_loss_backward_out_frame( +void multilabel_margin_loss_backward_out_frame( Tensor& grad_input, const Tensor& 
grad_output, const Tensor& input_contiguous, @@ -222,7 +222,7 @@ static void multilabel_margin_loss_backward_out_frame( } } -static void multilabel_margin_loss_backward_out_cpu_template( +void multilabel_margin_loss_backward_out_cpu_template( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index f003cfcf2c5a..f9dc074a6983 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -57,7 +57,7 @@ inline int64_t target_index_checked( } template -static inline void multi_margin_loss_cpu_kernel( +inline void multi_margin_loss_cpu_kernel( Tensor& output, const scalar_t* input_data, const int64_t* target_data, @@ -148,7 +148,7 @@ void multi_margin_loss_out_cpu_template( } template -static void multi_margin_loss_backward_cpu_kernel( +void multi_margin_loss_backward_cpu_kernel( scalar_t* grad_input_data, const Tensor& grad_output, const scalar_t* input_data, diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index ca86292403fb..576f56986988 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -159,7 +159,7 @@ inline scalar_t* optional_data(const Tensor& source) { } template -static void nll_loss_out_frame( +void nll_loss_out_frame( const Tensor& output, const Tensor& total_weight, const Tensor& input, @@ -338,7 +338,7 @@ void nll_loss_forward_out_cpu_template( } template -static void nll_loss_backward_out_frame( +void nll_loss_backward_out_frame( const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 4ce394ec2f56..7bea90cbd527 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -99,7 +99,7 @@ inline void check_gradout_shape_nll_loss2d( template -static void nll_loss2d_forward_out_frame( +void nll_loss2d_forward_out_frame( Tensor& output, Tensor& total_weight, const Tensor& input, @@ -280,7 +280,7 @@ void nll_loss2d_forward_out_cpu_template( } template -static void nll_loss2d_backward_out_frame( +void nll_loss2d_backward_out_frame( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index b261da5fe54e..4677542706f6 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -581,7 +581,7 @@ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M, template static scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation - static const scalar_t lanczos_sum_expg_scaled_num[13] = { + static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = { 0.006061842346248906525783753964555936883222, 0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -596,7 +596,7 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; - static const scalar_t lanczos_sum_expg_scaled_denom[13] = { + static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = { 1., 66., 1925., @@ -712,7 +712,7 @@ static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { template static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] - static const scalar_t d[25][25] = + static constexpr scalar_t d[25][25] = {{-3.3333333333333333e-1, 
8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index a71db5e8ef8d..f91b892efec2 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -23,8 +23,6 @@ Tensor& max_unpooling2d_forward_out_cpu( // Nondeterministic with duplicate indices at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); - auto oheight = output_size[0]; - auto owidth = output_size[1]; TORCH_CHECK( indices_.scalar_type() == at::ScalarType::Long, "elements in indices should be type int64 but got: ", indices_.scalar_type()); @@ -45,6 +43,9 @@ Tensor& max_unpooling2d_forward_out_cpu( self_.sizes(), " with dimension ", i , " being empty."); } + auto oheight = output_size[0]; + auto owidth = output_size[1]; + auto memory_format = self_.suggest_memory_format(); auto self = self_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp index 799b5ffa2cdb..08c42a0d470c 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp @@ -24,7 +24,7 @@ namespace at { namespace { -static inline void slow_conv_transpose2d_shape_check( +inline void slow_conv_transpose2d_shape_check( const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -386,7 +386,7 @@ void slow_conv_transpose2d_out_cpu_template( } } -static void slow_conv_transpose2d_backward_out_cpu_template( +void slow_conv_transpose2d_backward_out_cpu_template( const Tensor& input_, const Tensor& grad_output_, Tensor& grad_input, diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index f69e84521e5d..469269ab07df 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -22,7 +22,7 @@ namespace at::native { namespace { -static inline void slow_conv_transpose3d_shape_check( +inline void slow_conv_transpose3d_shape_check( const Tensor& input, const Tensor& grad_output, const Tensor& weight, diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 13b421d1e688..72526162d133 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -62,7 +62,7 @@ #include #include -static const int MIOPEN_DIM_MAX = 5; +static constexpr int MIOPEN_DIM_MAX = 5; namespace at::meta { @@ -92,7 +92,7 @@ namespace { arg_name, " should contain ", expected, " elements not ", actual); } - static inline Tensor repeat_if_defined(const Tensor& t, const SymInt& repeat) { + inline Tensor repeat_if_defined(const Tensor& t, const SymInt& repeat) { if (t.defined()) { return t.repeat_symint(repeat); } diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 8833bdb6e471..2a20f95f10c2 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -34,16 +34,16 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } } - auto shape = self.sizes().vec(); + auto shape = self.sym_sizes().vec(); // empty tensor could be converted to one hot representation, // but shape inference is not possible. 
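// [Annotation, not part of the patch] The Math.h and Normalization.cpp hunks above change
// in-function lookup tables and MIOPEN_DIM_MAX from `static const` to `static constexpr`.
// Minimal sketch of the distinction, assuming the initializers are literals (as they are here);
// `eval_poly_sketch` is a hypothetical example, not code from this patch:
#include <cstddef>

double eval_poly_sketch(double x) {
  // `static const` still allows dynamic initialization (and, in general, a thread-safe
  // init guard); `static constexpr` guarantees compile-time initialization and turns a
  // non-constant initializer into a hard error.
  static constexpr double coeffs[3] = {1.0, -0.5, 0.25};
  double acc = 0.0;
  for (std::size_t i = 0; i < 3; ++i) {
    acc = acc * x + coeffs[i];
  }
  return acc;
}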
- if (self.numel() == 0) { + if (self.sym_numel() == 0) { if (num_classes <= 0) { TORCH_CHECK(false, "Can not infer total number of classes from empty tensor."); } else { - shape.push_back(num_classes); - return at::empty(shape, self.options()); + shape.emplace_back(num_classes); + return at::empty_symint(shape, self.options()); } } @@ -66,8 +66,8 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } } - shape.push_back(num_classes); - Tensor ret = at::zeros(shape, self.options()); + shape.emplace_back(num_classes); + Tensor ret = at::zeros_symint(shape, self.options()); ret.scatter_(-1, self.unsqueeze(-1), 1); return ret; } diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 8099648d37b2..986447bab614 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -70,10 +70,10 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) new_shape.emplace_back(input_sizes[i]); } - for (const auto i : c10::irange((size_t)l_pad)) { + for (const auto i : c10::irange(l_pad)) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; - TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", + TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " "which is invalid. Check dimension ", l_diff + i, " of your input."); new_shape.emplace_back(new_dim); diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index f3858ac3d365..75b30320b027 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -108,6 +108,13 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) { return false; } +bool use_cudnn(const Tensor& t) { + bool acceptable = at::cudnn_is_acceptable(t); + auto st = t.scalar_type(); + bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN(); + return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf); +} + template using pair_of = std::pair; @@ -531,7 +538,7 @@ c10::intrusive_ptr make_quantized_cell_params_fp16( std::move(w_ih_packed), std::move(w_hh_packed)); } -static std::unordered_map< +std::unordered_map< std::string, c10::intrusive_ptr (*)(CellParamsSerializationType)> cell_params_deserializers = { @@ -571,7 +578,7 @@ struct QRNNCellParamsWrapper { // Gathers every two elements of a vector in a vector of pairs template -static std::vector> pair_vec(const std::vector& vals) { +std::vector> pair_vec(const std::vector& vals) { TORCH_CHECK(vals.size() % 2 == 0, "Odd number of params or hiddens given to a bidirectional RNN"); std::vector> result; result.reserve(vals.size() / 2); @@ -583,7 +590,7 @@ static std::vector> pair_vec(const std::vector& vals) { // Flattens a vector of pairs template -static std::vector unpair_vec(std::vector>&& vals) { +std::vector unpair_vec(std::vector>&& vals) { std::vector result; result.reserve(vals.size() * 2); for (const auto i : c10::irange(vals.size())) { @@ -594,7 +601,7 @@ static std::vector unpair_vec(std::vector>&& vals) { } // Parses a flat list of parameter tensors into a list of CellParams -static std::vector gather_params(TensorList params, bool has_biases, bool has_projections = false) { +std::vector gather_params(TensorList params, bool has_biases, bool has_projections = false) { static at::Tensor undefined; 
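// [Annotation, not part of the patch] The Onehot.cpp hunk above rebuilds the output shape
// from SymInts so one_hot stays usable under symbolic/dynamic shapes. Condensed sketch of
// the same pattern with the ATen calls used in the hunk (error handling omitted;
// `one_hot_sketch` is a hypothetical name):
#include <ATen/ATen.h>
#include <cstdint>

at::Tensor one_hot_sketch(const at::Tensor& self, int64_t num_classes) {
  auto shape = self.sym_sizes().vec();      // std::vector<c10::SymInt>
  shape.emplace_back(num_classes);          // SymInt is constructible from int64_t
  at::Tensor ret = at::zeros_symint(shape, self.options());
  ret.scatter_(-1, self.unsqueeze(-1), 1);  // same scatter_ as the real operator
  return ret;
}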
std::vector result; if (has_biases) { @@ -1200,7 +1207,7 @@ std::tuple _thnn_fused_lstm_cell_backwar bool train, \ bool bidirectional, \ bool batch_first) { \ - if (at::cudnn_is_acceptable(_input)) { \ + if (use_cudnn(_input)) { \ Tensor output, hy; \ NAME##_cudnn_stub( \ _input.device().type(), \ @@ -1262,7 +1269,7 @@ std::tuple _thnn_fused_lstm_cell_backwar double dropout_p, \ bool train, \ bool bidirectional) { \ - if (at::cudnn_is_acceptable(data)) { \ + if (use_cudnn(data)) { \ Tensor output, hy; \ NAME##_packed_cudnn_stub( \ data.device().type(), \ @@ -1430,7 +1437,7 @@ std::tuple lstm( TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); - if (at::cudnn_is_acceptable(_input)) { + if (use_cudnn(_input)) { Tensor output, hy, cy; lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional, batch_first); @@ -1491,7 +1498,7 @@ std::tuple lstm( TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional) { TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); - if (at::cudnn_is_acceptable(data)) { + if (use_cudnn(data)) { Tensor output, hy, cy; lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional); @@ -1887,10 +1894,10 @@ static DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple namespace { -[[maybe_unused]] static auto ensure_linear_params_registered = +[[maybe_unused]] auto ensure_linear_params_registered = register_linear_params(); -static auto cell_params_base_registry = +auto cell_params_base_registry = torch::selective_class_("rnn", TORCH_SELECTIVE_CLASS("CellParamsBase")) .def_pickle( [](const c10::intrusive_ptr& self) diff --git a/aten/src/ATen/native/RangeUtils.h b/aten/src/ATen/native/RangeUtils.h index dcab86ca9a42..fd62b8e01329 100644 --- a/aten/src/ATen/native/RangeUtils.h +++ b/aten/src/ATen/native/RangeUtils.h @@ -47,7 +47,7 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar int64_t sgn = (xstep > 0) - (xstep < 0); size_d = std::ceil((xend - xstart + xstep - sgn) / xstep); } else { - size_d = std::ceil(static_cast(end.to() - start.to()) + size_d = std::ceil((end.to() - start.to()) / step.to()); } diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index daf153e460e9..a946def225b0 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -107,11 +107,6 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { storage->set_nbytes(size_bytes); } -// Call the sparse implementation in SparseTensor.cpp directly. -// A dynamic dispatch here is NOT necessary, so I didn't put -// this function in native_functions.yaml -const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src); - // TODO(VitalyFedyunin): Move it to HTML docs. 
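// [Annotation, not part of the patch] The use_cudnn() gate added in RNN.cpp above keeps the
// old fp32/fp64/fp16 routing and additionally sends bf16 inputs to cuDNN only when the
// backend hook reports support. Restated as a dtype-only predicate for illustration
// (`rnn_dtype_wants_cudnn` is hypothetical; the real helper also requires
// at::cudnn_is_acceptable(t) to pass, and `bf16_supported` stands in for
// at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN()):
#include <c10/core/ScalarType.h>

bool rnn_dtype_wants_cudnn(c10::ScalarType st, bool bf16_supported) {
  return (st == c10::kBFloat16 && bf16_supported) ||
         st == c10::kDouble || st == c10::kFloat || st == c10::kHalf;
}
// rnn_dtype_wants_cudnn(c10::kFloat,    /*bf16_supported=*/false) -> true
// rnn_dtype_wants_cudnn(c10::kBFloat16, /*bf16_supported=*/false) -> false
// rnn_dtype_wants_cudnn(c10::kBFloat16, /*bf16_supported=*/true)  -> true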
// // Strides of the output tensor of `resize_as_` operator is defined by input diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 0053b86c3373..39e203f63278 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -15,7 +15,11 @@ namespace at::native { Scalar item(const Tensor& self) { auto numel = self.sym_numel(); - TORCH_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar"); + TORCH_SYM_CHECK( + numel.sym_eq(1), + "a Tensor with ", + numel, + " elements cannot be converted to Scalar"); if (self.is_sparse()) { if (self._nnz() == 0) return Scalar(0); if (self.is_coalesced()) return at::_local_scalar_dense(self._values()); diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index 1de72abd5886..15794040bf39 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -346,17 +346,17 @@ template struct AbsSwitch {}; template -inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch) { +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch /*unused*/) { return static_cast(data); } template -inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch) { +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch /*unused*/) { return static_cast(std::abs(data)); } template -inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch) { +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch /*unused*/) { return static_cast(std::abs(at::opmath_type>(data))); } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 7d613fc02312..451869f521df 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -145,12 +145,6 @@ #include #include -namespace at::native { - -AdvancedIndex make_info(Tensor self, IOptTensorListRef orig); - -} // namespace at::native - namespace at::meta { TORCH_META_FUNC(gather) @@ -1912,11 +1906,9 @@ Tensor& index_fill_( "This also applies to advanced indexing e.g. tensor[mask] = scalar"); } - if (!self.is_complex() && source.isComplex()) { - TORCH_CHECK( - false, - "index_fill_(): Converting complex Scalar to non-complex type is not supported"); - } + TORCH_CHECK( + self.is_complex() || !source.isComplex(), + "index_fill_(): Converting complex Scalar to non-complex type is not supported"); // Handle the case when `self` is 0-dim Tensor self_nonzero_dim = (self.dim() == 0) ? 
self.unsqueeze(-1) : self; @@ -2682,7 +2674,7 @@ inline std::tuple _take_along_dim_helper( std::move(dim)); } -static inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { +inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { TORCH_CHECK( !t.defined() || t.device() == device, "Expected tensor to have ", @@ -2695,7 +2687,7 @@ static inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { ")"); } -static inline void checkDevice( +inline void checkDevice( CheckedFrom c, at::ArrayRef tensors, Device device) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h index bc6c2533eac5..6f127b711d3e 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h +++ b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h @@ -77,7 +77,7 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { // next broadcast all index tensors together try { indices = expand_outplace(indices); - } catch (std::exception& e) { + } catch (std::exception&) { TORCH_CHECK_INDEX( false, "shape mismatch: indexing tensors could not be broadcast together" diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index d9a42da482c0..c6126eda61e7 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -73,7 +73,6 @@ #include #include -#include #include #endif @@ -847,7 +846,7 @@ TORCH_IMPL_FUNC(clamp_Tensor_out) (const Tensor& self, const OptionalTensorRef min, const OptionalTensorRef max, - const Tensor&) { + const Tensor& /*unused*/) { if (min && max) { clamp_stub(device_type(), *this); } else if (min) { diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 4fa0556ad785..c15b082f107b 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -91,9 +91,6 @@ bool cudnn_is_acceptable(const TensorBase& self) { return false; if (!self.is_cuda()) return false; - auto st = self.scalar_type(); - if (!(st == kDouble || st == kFloat || st == kHalf)) - return false; if (!detail::getCUDAHooks().compiledWithCuDNN()) return false; // cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c2d0856c3cd4..6df7761d822d 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1,3 +1,5 @@ +#include +#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -1878,19 +1880,18 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { Tensor xtensor = self.expand(padded_size); - Tensor result; + Tensor urtensor; if (self.is_quantized()) { - result = at::empty_quantized(target_size, self); + urtensor = at::empty_quantized(target_size, self); } else { - result = at::empty(target_size, self.options()); + urtensor = at::empty(target_size, self.options()); } // return an empty tensor if one of the repeat dimensions is zero if (zero_tensor) { - return result; + return urtensor; } - Tensor urtensor = at::alias(result); for (const auto i : c10::irange(xtensor.dim())) { // can't unfold with step 0, so make sure step is at least 1 // (it doesn't matter what it is in that case, because the size is 0). 
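// [Annotation, not part of the patch] The index_fill_ hunk above folds
// `if (!self.is_complex() && source.isComplex()) TORCH_CHECK(false, ...)` into a single
// positive-form TORCH_CHECK. The two are equivalent by De Morgan: the old code failed
// exactly when `!self.is_complex() && source.isComplex()`, i.e. it succeeded when
// `self.is_complex() || !source.isComplex()`, which is the new condition. Generic
// illustration (`check_fill_dtype` is a hypothetical helper):
#include <c10/util/Exception.h>

void check_fill_dtype(bool self_is_complex, bool source_is_complex) {
  TORCH_CHECK(self_is_complex || !source_is_complex,
              "index_fill_(): Converting complex Scalar to non-complex type is not supported");
}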
@@ -1900,7 +1901,22 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { urtensor.copy_(xtensor.expand_as(urtensor)); - return result; + // Combine the dimensions to produce the target_size. + // xtensor dims: [a0, ..., ad-1] + // urtensor dims: [a0, ..., ad-1, b0, ..., bd-1] + // b dims are produced by unfold. + // Transform urtensor to [a0 * b0, ..., ad-1 * bd-1] + const int64_t n_dims = xtensor.dim(); + auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong)); + auto range_b = range_a + n_dims; + auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten(); + auto permutation = IntArrayRef(stacked.data_ptr(), n_dims * 2); + // Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1] + urtensor = urtensor.permute(permutation); + // Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1] + urtensor = urtensor.reshape(target_size); + + return urtensor; } Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { @@ -2051,7 +2067,7 @@ Tensor _reshape_copy_symint( TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tensors"); } - if (self.is_contiguous()) { + if (self.is_contiguous_or_false()) { return self.view_symint(shape).clone(at::MemoryFormat::Contiguous); } else { return at::_unsafe_view_symint( @@ -3625,7 +3641,7 @@ Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) { namespace { // Transpose implementation for sparse compressed layouts // NB: We assume that dim1,dim0 have already been wrapped -static inline Tensor sparse_compressed_transpose( +inline Tensor sparse_compressed_transpose( const Tensor& self, int64_t dim0, int64_t dim1) { diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h index 3030cb54aea6..156d2c8974b8 100644 --- a/aten/src/ATen/native/UnfoldBackward.h +++ b/aten/src/ATen/native/UnfoldBackward.h @@ -29,7 +29,7 @@ namespace { // grad_in does not mean that it is a gradient wrt to input, // grad_in/grad_out is just an input/output of unfold_backward kernel. 
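// [Annotation, not part of the patch] Worked example of the dim-combining step the repeat()
// hunk above adds. Suppose xtensor has sizes [2, 3] and target_size is [8, 15] (repeats
// [4, 5]). After the unfold loop, urtensor has sizes [2, 3, 4, 5] (a-dims then b-dims).
// stack({range_a, range_b}, 1).flatten() yields the permutation [0, 2, 1, 3], which
// interleaves the pairs to sizes [2, 4, 3, 5]; the final reshape then merges each
// (a_i, b_i) pair into [8, 15]. The same permutation can be computed without tensors
// (`interleave_permutation` is a hypothetical helper, shown only to make the index
// arithmetic explicit):
#include <cstdint>
#include <vector>

std::vector<int64_t> interleave_permutation(int64_t n_dims) {
  std::vector<int64_t> perm;
  perm.reserve(2 * n_dims);
  for (int64_t i = 0; i < n_dims; ++i) {
    perm.push_back(i);           // a_i, one of the original (expanded) dims
    perm.push_back(i + n_dims);  // b_i, the matching dim produced by unfold
  }
  return perm;  // n_dims == 2  ->  {0, 2, 1, 3}
}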
-[[maybe_unused]] static TensorIterator _make_unfold_backward_iter_over_grad_out( +[[maybe_unused]] TensorIterator _make_unfold_backward_iter_over_grad_out( Tensor& grad_out, const Tensor& grad_in, int64_t dim, diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index d9e1bf30c6a6..b14079e7ea19 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -124,7 +124,7 @@ struct IsUnique {}; template struct IsUnique { - inline bool operator() (scalar_t* data_ptr, int64_t i) { + bool operator() (scalar_t* data_ptr, int64_t i) { if (i == 0) { return true; } return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]); } @@ -132,7 +132,7 @@ struct IsUnique { template struct IsUnique { - inline bool operator() (scalar_t* data_ptr, int64_t i) { + bool operator() (scalar_t* data_ptr, int64_t i) { if (i == 0) { return true; } return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1])) && !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1])); diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 5b49fdd02954..cf6727c2207c 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -407,7 +406,7 @@ scalar_t cubic_convolution2(scalar_t x, scalar_t A) { } template -void get_cubic_upsample_coefficients( +static inline void get_cubic_upsample_coefficients( scalar_t coeffs[4], scalar_t t) { scalar_t A = -0.75; diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index b02d809bb57a..3ab8795f6dca 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -105,7 +105,7 @@ namespace at::native { namespace { template -static void upsample_bicubic2d_backward_out_frame( +void upsample_bicubic2d_backward_out_frame( const scalar_t* odata, scalar_t* idata, int64_t input_height, @@ -177,7 +177,7 @@ static void upsample_bicubic2d_backward_out_frame( }); } -static void upsample_bicubic2d_backward_kernel( +void upsample_bicubic2d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index 8c8ad45acc44..02c798a3d040 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -25,11 +25,11 @@ namespace at::native { void _backward(const Tensor& self, TensorList inputs, const std::optional& gradient_opt, std::optional keep_graph, bool create_graph) { - return self._backward(inputs, gradient_opt, keep_graph, create_graph); + self._backward(inputs, gradient_opt, keep_graph, create_graph); } void set_data(Tensor& self, const Tensor& new_data) { - return self.set_data(new_data); + self.set_data(new_data); } Tensor data(const Tensor& self) { @@ -54,7 +54,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) { } void retain_grad(Tensor& self) { - return self.retain_grad(); + self.retain_grad(); } bool retains_grad(const Tensor& self) { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp index f528dd14adb0..0773217c90a4 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp @@ -39,6 +39,6 @@ int register_linear_params() { } namespace { -[[maybe_unused]] static auto linear_params = 
register_linear_params(); +[[maybe_unused]] auto linear_params = register_linear_params(); } // namespace } // namespace ao::sparse diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp index ab2da21d4b58..9bb8fbdb0e05 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp @@ -17,7 +17,7 @@ namespace ao::sparse { -int register_linear_params(); + #ifdef USE_FBGEMM diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index 8c3a93289c10..968e58d591c1 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -20,7 +20,7 @@ namespace ao::sparse { -int register_linear_params(); + #ifdef USE_FBGEMM namespace { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp index bda1984d6207..b9cffe5b0bcb 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp @@ -16,7 +16,7 @@ #endif namespace ao::sparse { -int register_linear_params(); + #ifdef USE_FBGEMM diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index 00c9f4eb2534..bc9b452bc687 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -30,7 +30,7 @@ namespace { // Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON __attribute__((optimize("no-tree-vectorize"))) #endif -static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { +void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { if (at::isReducedFloatingType(input.scalar_type())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() { using Vec = Vectorized; @@ -96,7 +96,7 @@ static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const } } -static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { +void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() { using Vec = Vectorized; @@ -150,7 +150,7 @@ static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { } } -static void threshold_kernel( +void threshold_kernel( TensorIteratorBase& iter, const Scalar& threshold_scalar, const Scalar& value_scalar) { @@ -868,7 +868,7 @@ void hardswish_backward_kernel(TensorIterator& iter) { } } -static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&]() { auto zero_vec = Vectorized((float)(0)); @@ -907,7 +907,7 @@ static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { } } -static void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) { if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", 
[&]() { auto zero_vec = Vectorized((float)(0)); diff --git a/aten/src/ATen/native/cpu/AtomicAddFloat.h b/aten/src/ATen/native/cpu/AtomicAddFloat.h index 5b24ee4821c4..526f86d705b7 100644 --- a/aten/src/ATen/native/cpu/AtomicAddFloat.h +++ b/aten/src/ATen/native/cpu/AtomicAddFloat.h @@ -22,7 +22,7 @@ static inline void cpu_atomic_add_float(float* dst, float fvalue) old_value.floatV = *dst; new_value.floatV = old_value.floatV + fvalue; - unsigned* old_intV = (unsigned*)(&old_value.intV); + unsigned* old_intV = &old_value.intV; while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) { #ifdef __aarch64__ __asm__ __volatile__("yield;" : : : "memory"); diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 3db9646b31c4..10e0daacab33 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -300,7 +300,8 @@ void div_floor_kernel(TensorIteratorBase& iter) { // In the special case of unsigned integer division, floor division is // equivalent to truncation division (since the signs of the divisor and // dividend are always the same) - return div_trunc_kernel(iter); + div_trunc_kernel(iter); + return; } else if (isIntegralType(dtype, /*includeBool*/ false)) { // There's no SIMD integer division, so don't try to vectorize it. AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() { diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index ab3b16c395a3..2e3a82ac049e 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -118,7 +118,7 @@ gemm_notrans_( scale_(m, n, beta, c, ldc); // c += alpha * (a @ b) - const uint64_t unsigned_m = static_cast(m); + const uint64_t unsigned_m = m; const uint64_t i_m = unsigned_m / 4; for (const uint64_t l : c10::irange(k)) { for (const uint64_t j : c10::irange(n)) { @@ -369,7 +369,7 @@ void gemm_notrans_( #endif // defined(__aarch64__) && !defined(C10_MOBILE) #if !defined(C10_MOBILE) -static float compute_dot(const at::Half* a, const at::Half* b, int64_t len) { +float compute_dot(const at::Half* a, const at::Half* b, int64_t len) { return at::native::CPU_CAPABILITY::fp16_dot_with_fp32_arith( a, b, len); } @@ -406,7 +406,7 @@ void gemm_transa_( }); } -static float compute_dot(const at::BFloat16* a, const at::BFloat16* b, int64_t len) { +float compute_dot(const at::BFloat16* a, const at::BFloat16* b, int64_t len) { return at::native::CPU_CAPABILITY::bf16_dot_with_fp32_arith(a, b, len); } diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index 78651bca746d..365a79ba52ca 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -15,12 +15,12 @@ namespace at::native { inline namespace CPU_CAPABILITY { namespace { -static bool reduced_input(ScalarType input_t, ScalarType output_t) { +bool reduced_input(ScalarType input_t, ScalarType output_t) { return !at::isFloat8Type(input_t) && at::isReducedFloatingType(input_t) && output_t == kFloat; } -static bool reduced_output(ScalarType input_t, ScalarType output_t) { +bool reduced_output(ScalarType input_t, ScalarType output_t) { return !at::isFloat8Type(output_t) && at::isReducedFloatingType(output_t) && input_t == kFloat; } diff --git a/aten/src/ATen/native/cpu/CrossKernel.cpp b/aten/src/ATen/native/cpu/CrossKernel.cpp index b380ef619b40..66e49f911f68 100644 --- a/aten/src/ATen/native/cpu/CrossKernel.cpp +++ 
b/aten/src/ATen/native/cpu/CrossKernel.cpp @@ -15,7 +15,7 @@ namespace at::native { namespace { template -static void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { int64_t total = a.numel() / 3; int64_t a_stride = a.stride(dim); int64_t b_stride = b.stride(dim); @@ -68,7 +68,7 @@ static void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, }); } -static void cross_kernel_impl(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +void cross_kernel_impl(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, result.scalar_type(), "cross", [&]() { apply_cross(result, a, b, dim); }); diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp index 6526a4308221..1f9a8ff1097d 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp @@ -452,11 +452,11 @@ void convolution_depthwise3x3_winograd_impl( #else void convolution_depthwise3x3_winograd_impl( - const Arguments&, - const float* const, - const float* const, - const float* const, - float* const) { + const Arguments& /*unused*/, + const float* const /*unused*/, + const float* const /*unused*/, + const float* const /*unused*/, + float* const /*unused*/) { } #endif /* __ARM_NEON__ */ diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index a1a7059b7d64..412d90d9e454 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -422,19 +422,19 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double }); } -static void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { +void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_backward", [&] { Dist::apply_backward_pdist(result, grad, self, p, dist); }); } -static void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, const double p) { +void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, const double p) { AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist", [&] { Dist::apply_cdist(result, x1, x2, p); }); } -static void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& dist) { +void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& dist) { AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist_backward", [&] { Dist::apply_backward_cdist(result, grad, x1, x2, p, dist); }); diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index a61e0364579b..e3fdefb52304 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -27,7 +27,7 @@ namespace at::native { namespace { -static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { +void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { CPUGeneratorImpl* generator = 
get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::cauchy_kernel(iter, median, sigma, generator); } @@ -101,7 +101,7 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { +void exponential_kernel_default(TensorIteratorBase& iter, double lambda, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::exponential_kernel(iter, lambda, generator); } @@ -198,12 +198,12 @@ void exponential_kernel(TensorIteratorBase &iter, double lambda, std::optional gen) { +void geometric_kernel(TensorIteratorBase& iter, double p, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::geometric_kernel(iter, p, generator); } -static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { +void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::log_normal_kernel(iter, mean, std, generator); } @@ -218,12 +218,12 @@ void normal_kernel(const TensorBase &self, double mean, double std, std::optiona templates::cpu::normal_kernel(self, mean, std, generator); } -static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_from_to_kernel(iter, range, base, generator); } -static void random_kernel(TensorIteratorBase& iter, std::optional gen) { +void random_kernel(TensorIteratorBase& iter, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_kernel(iter, generator); } @@ -231,7 +231,7 @@ static void random_kernel(TensorIteratorBase& iter, std::optional gen // This is the special kernel to handle single specific case: // from(inclusive) = std::numeric_limits::lowest() // to(exclusive) = None (= std::numeric_limits::max() + 1) -static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_full_64_bits_range_kernel(iter, generator); } diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 8171ae8e79ad..1f8693902a32 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -85,7 +85,7 @@ struct RandomKernel { // ==================================================== Normal ======================================================== #ifdef CPU_CAPABILITY_AVX2 -static void normal_fill_16_AVX2(float *data, +void normal_fill_16_AVX2(float *data, const __m256* two_pi, const __m256* one, const __m256* minus_two, @@ -136,7 +136,7 @@ void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, #endif template -static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { +void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { for (const auto j : c10::irange(8)) { const 
scalar_t u1 = 1 - data[j]; // [0, 1) -> (0, 1] for log. const scalar_t u2 = data[j + 8]; diff --git a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp index 4432b9ace791..5ac497139607 100644 --- a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp +++ b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp @@ -158,14 +158,14 @@ inline void _mul_reduce_max_fusion_kernel( } template -static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { +inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { TORCH_CHECK(ptr2 == nullptr); return ptr; } template , int> = 0> -static inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { +inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { return ptr2; } diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 9450b7eca9b3..7587988528eb 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -441,7 +441,7 @@ struct ComputeLocation // See NOTE [ Grid Sample CPU Kernels ] for details. template -static inline void +inline void mask_scatter_add(const scalar_t *src, scalar_t* base_addr, const int_same_size_t *offsets, const int_same_size_t *mask, int64_t len) { @@ -1030,7 +1030,7 @@ struct ApplyGridSample -static inline void grid_sample_2d_grid_slice_iterator( +inline void grid_sample_2d_grid_slice_iterator( const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { int64_t out_H = grid_slice.size(0); int64_t out_W = grid_slice.size(1); diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index 4a16d2bb7ba9..261683a187b8 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -259,7 +259,7 @@ void histogramdd_out_cpu_template(const Tensor& self, const std::optional& weight, bool density, +void histogramdd_kernel_impl(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges) { histogramdd_out_cpu_template(self, weight, density, hist, bin_edges); } @@ -269,7 +269,7 @@ static void histogramdd_kernel_impl(const Tensor& self, const std::optional& weight, +void histogramdd_linear_kernel_impl(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges, bool local_search) { if (local_search) { // histogramdd codepath: both hist and bin_edges are eventually returned as output, @@ -298,7 +298,7 @@ void infer_bin_edges_from_input(const Tensor& input, const int64_t N, std::copy(max_data, max_data + N, rightmost_edges.begin()); } -static void histogram_select_outer_bin_edges_impl(const Tensor& input, const int64_t N, +void histogram_select_outer_bin_edges_impl(const Tensor& input, const int64_t N, std::vector &leftmost_edges, std::vector &rightmost_edges) { AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "histogramdd", [&]() { infer_bin_edges_from_input(input, N, leftmost_edges, rightmost_edges); diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp index 1e6723b5f08b..57d3ab89c617 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -749,21 +749,29 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { // }); if (iter_dtype == kByte) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kChar) { - return 
cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kInt) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kLong) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kShort) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kBool) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kFloat) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kDouble) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } } // other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below) @@ -778,10 +786,12 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { c == input_strides_2[1] && c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well ) { - return cpu_hflip_channels_last_vec(iter); + cpu_hflip_channels_last_vec(iter); + return; } // Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec) - return cpu_vflip_memcpy(iter); + cpu_vflip_memcpy(iter); + return; } AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu", diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 83b51a998563..aad618a258a3 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -46,7 +46,7 @@ using namespace vec; template typename traits::ArgsTuple dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, - std::index_sequence) { + std::index_sequence /*unused*/) { return std::make_tuple( c10::load::type>( data[INDEX] + i * strides[INDEX])...); @@ -65,7 +65,7 @@ dereference_vec_impl(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i, - std::index_sequence) { + std::index_sequence /*unused*/) { using Vec = typename traits::result_type; using scalar_t = typename Vec::value_type; return std::make_tuple( @@ -231,7 +231,7 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve template inline void unroll_contiguous_scalar_checks( const int64_t* /*strides*/, - std::index_sequence<>, + std::index_sequence<> /*unused*/, cb_t&& cb) { cb(0); } @@ -239,7 +239,7 @@ inline void unroll_contiguous_scalar_checks( template inline void unroll_contiguous_scalar_checks( const int64_t* strides, - std::index_sequence, + std::index_sequence /*unused*/, cb_t&& cb) { if (is_contiguous_scalar(strides)) { cb(INDEX0 + 1); diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index b75acf4ffc24..7ea8e87e28b1 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -210,7 +210,7 @@ multinomial_with_replacement_apply( } } -static void multinomial_with_replacement_kernel_impl( +void multinomial_with_replacement_kernel_impl( Tensor& result, const Tensor& self, const int64_t n_sample, diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index 59d838b9782d..853fc959f634 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -96,7 +96,7 @@ struct ReplicationPad { }; template -static inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { +inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { using Vec = Vectorized; 
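// [Annotation, not part of the patch] Several hunks above (div_floor_kernel, the pow
// specializations, flip_kernel, the reduced-precision gemv paths) rewrite
// `return some_void_call(...);` as a plain call followed by `return;`. Both forms are
// valid C++ for void functions and behave identically; the split form appears to be
// what the lint pass driving this patch prefers (that motivation is an assumption).
// Minimal illustration with a hypothetical helper:
void log_message() {}

void before_style() { return log_message(); }   // legal: returns a void expression
void after_style()  { log_message(); return; }  // equivalent, two explicit statements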
int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { @@ -112,7 +112,7 @@ static inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { } template -static inline void add_stub(scalar_t* grad_in, const scalar_t* grad_out, int64_t size) { +inline void add_stub(scalar_t* grad_in, const scalar_t* grad_out, int64_t size) { using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index a9d6db2c0382..6fad9270bf19 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -9,7 +9,7 @@ namespace at::native { namespace { -static void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { ScalarType dtype = iter.common_dtype(); if (at::isReducedFloatingType(dtype)) { AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcmul_cpu_out", [&]() { @@ -50,7 +50,7 @@ static void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { } } -static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { ScalarType dtype = iter.common_dtype(); if (at::isReducedFloatingType(dtype)) { AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcdiv_cpu_out", [&]() { @@ -90,7 +90,7 @@ static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { } } -static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { +void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { ScalarType dtype = iter.dtype(0); if (dtype == kBFloat16) { auto norm_val = norm.to(); @@ -176,7 +176,7 @@ static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& no } } -static void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { +void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { ScalarType dtype = iter.dtype(0); AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, dtype, "huber_backward_cpu_out", [&] { auto norm_val = norm.to(); @@ -215,7 +215,7 @@ static void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, }); } -static void mse_backward_cpu_kernel(TensorIterator& iter, const Scalar& value) { +void mse_backward_cpu_kernel(TensorIterator& iter, const Scalar& value) { ScalarType dtype = iter.dtype(0); AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "mse_backward_cpu_out", [&] { scalar_t scalar_val = value.to(); diff --git a/aten/src/ATen/native/cpu/PowKernel.cpp b/aten/src/ATen/native/cpu/PowKernel.cpp index 2cf751f05116..18e14ed5d30d 100644 --- a/aten/src/ATen/native/cpu/PowKernel.cpp +++ b/aten/src/ATen/native/cpu/PowKernel.cpp @@ -96,11 +96,14 @@ static void pow_tensor_scalar_kernel( dtype == kBFloat16 || isComplexType(dtype)) { // Dispatch to fast specialization for sqrt, rsqrt and reciprocal if (exp_scalar.equal(.5)) { - return sqrt_kernel(iter); + sqrt_kernel(iter); + return; } else if (exp_scalar.equal(-0.5)) { - return rsqrt_kernel(iter); + rsqrt_kernel(iter); + return; } else if (exp_scalar.equal(-1.0)) { - return reciprocal_kernel(iter); + reciprocal_kernel(iter); + return; } } diff --git a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp index ee9396136612..b469aa5c2eee 
100644 --- a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp +++ b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp @@ -18,7 +18,7 @@ namespace { using namespace vec; -static void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_steps, const Scalar& scalar_step) { +void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_steps, const Scalar& scalar_step) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "arange_cpu", [&]() { using accscalar_t = at::acc_type; auto start = scalar_start.to(); @@ -42,7 +42,7 @@ static void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, cons }); } -static void linspace_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_end, int64_t steps) { +void linspace_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_end, int64_t steps) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.dtype(), "linspace_cpu", [&]() { // step should be of double type for all integral types using step_t = std::conditional_t, double, scalar_t>; diff --git a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp index a53fe53a8457..c7eaa802af12 100644 --- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp @@ -62,7 +62,7 @@ inline void reduce_all_impl( output.fill_(result); } -static void min_all_kernel_impl(Tensor& result, const Tensor& input) { +void min_all_kernel_impl(Tensor& result, const Tensor& input) { if (input.scalar_type() == ScalarType::Bool) { TensorIterator iter = TensorIteratorConfig() .add_input(input) @@ -87,7 +87,7 @@ static void min_all_kernel_impl(Tensor& result, const Tensor& input) { } } -static void max_all_kernel_impl(Tensor& result, const Tensor& input) { +void max_all_kernel_impl(Tensor& result, const Tensor& input) { if (input.scalar_type() == ScalarType::Bool) { TensorIterator iter = TensorIteratorConfig() .add_input(input) @@ -167,7 +167,7 @@ inline void reduce_all_impl_vec_two_outputs( output2.fill_(result.second); } -static void aminmax_allreduce_kernel( +void aminmax_allreduce_kernel( const Tensor& input, Tensor& min_result, Tensor& max_result) { diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index c06731dfc718..2e6293650194 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -28,7 +28,7 @@ namespace at::native { namespace { using namespace vec; template -static inline void cpu_cum_base_kernel(const Tensor& result, +inline void cpu_cum_base_kernel(const Tensor& result, const Tensor& self, int64_t dim, const func_t& f, @@ -76,7 +76,7 @@ static inline void cpu_cum_base_kernel(const Tensor& result, iter.for_each(loop, grain_size); } -static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -95,7 +95,7 @@ static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t }); } -static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ 
-114,7 +114,7 @@ static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t }); } -static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { +void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -135,7 +135,7 @@ static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t }); } -static void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) { +void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) { AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "std_cpu", [&] { binary_kernel_reduce( iter, @@ -148,7 +148,7 @@ static void std_var_kernel_impl(TensorIterator& iter, double correction, bool ta }); } -static void prod_kernel_impl(TensorIterator& iter) { +void prod_kernel_impl(TensorIterator& iter) { // Workaround for the error: '*' in boolean context, suggest '&&' instead if (iter.dtype() == ScalarType::Bool) { using scalar_t = bool; @@ -203,7 +203,7 @@ void norm_kernel_cpu_impl(TensorIterator& iter, const double& val) { } } -static void norm_kernel_tensor_iterator_impl( +void norm_kernel_tensor_iterator_impl( TensorIterator& iter, const Scalar& p) { double val = 0; @@ -256,10 +256,10 @@ static void norm_kernel_tensor_iterator_impl( } else { if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel - return norm_kernel_cpu_impl(iter, val); + norm_kernel_cpu_impl(iter, val); return; } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel - return norm_kernel_cpu_impl(iter, val); + norm_kernel_cpu_impl(iter, val); return; } AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] { @@ -274,7 +274,7 @@ static void norm_kernel_tensor_iterator_impl( } } -static void and_kernel_impl(TensorIterator& iter) { +void and_kernel_impl(TensorIterator& iter) { if (iter.dtype() == ScalarType::Byte) { // Refer [all, any : uint8 compatibility] binary_kernel_reduce_vec( @@ -312,7 +312,7 @@ static void and_kernel_impl(TensorIterator& iter) { } } -static void or_kernel_impl(TensorIterator& iter) { +void or_kernel_impl(TensorIterator& iter) { if (iter.dtype() == ScalarType::Byte) { // Refer [all, any : uint8 compatibility] binary_kernel_reduce_vec( @@ -346,7 +346,7 @@ struct MinValuesOps: public at::native::MinOps { } }; -static void min_values_kernel_impl(TensorIterator& iter) { +void min_values_kernel_impl(TensorIterator& iter) { if (iter.dtype() == kLong) { // This case is special because of Vectorized does not // handle upper_bound(). 
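// [Annotation, not part of the patch] The dominant mechanical change in these cpu/ kernel
// files is dropping `static` from functions and namespace-scope objects that already sit
// inside an unnamed namespace. Such declarations already have internal linkage, so the
// keyword is redundant. Minimal illustration (hypothetical names):
namespace {
[[maybe_unused]] static int helper_old() { return 1; }  // internal linkage; `static` adds nothing
[[maybe_unused]] int helper_new() { return 1; }         // same linkage without the keyword
}  // namespace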
@@ -367,7 +367,7 @@ static void min_values_kernel_impl(TensorIterator& iter) { }); } -static void max_values_kernel_impl(TensorIterator& iter) { +void max_values_kernel_impl(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] { binary_kernel_reduce_vec( iter, @@ -377,7 +377,7 @@ static void max_values_kernel_impl(TensorIterator& iter) { }); } -static void argmax_kernel_impl(TensorIterator &iter) { +void argmax_kernel_impl(TensorIterator &iter) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmax_cpu", [&] { if (is_reduce_lastdim(iter)) { using arg_t = std::pair; @@ -401,7 +401,7 @@ static void argmax_kernel_impl(TensorIterator &iter) { }); } -static void argmin_kernel_impl(TensorIterator &iter) { +void argmin_kernel_impl(TensorIterator &iter) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmin_cpu", [&] { if (is_reduce_lastdim(iter)) { using arg_t = std::pair; @@ -459,7 +459,7 @@ struct XorSumOps { } }; -static void xor_sum_kernel_impl(TensorIterator& iter) { +void xor_sum_kernel_impl(TensorIterator& iter) { // Use iter.dtype(1) to dispatch based on the type of the input tensor AT_DISPATCH_ALL_TYPES_AND3( kBFloat16, kHalf, kBool, iter.dtype(1), "xor_sum_cpu", [&] { diff --git a/aten/src/ATen/native/cpu/ReduceUtils.h b/aten/src/ATen/native/cpu/ReduceUtils.h index fd7c4a2750a6..1b0be8d18db7 100644 --- a/aten/src/ATen/native/cpu/ReduceUtils.h +++ b/aten/src/ATen/native/cpu/ReduceUtils.h @@ -8,7 +8,6 @@ #include #include #include -#include namespace at::native { inline namespace CPU_CAPABILITY { diff --git a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp index ed5658f5f0f5..8d22201ed63c 100644 --- a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp +++ b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp @@ -428,10 +428,11 @@ void fp16_gemv_trans( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0); #if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) if (at::globalContext().allowFP16ReductionCPU()) { - return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); + fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); + return; } #endif - return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); + fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); } float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) { @@ -465,7 +466,7 @@ void bf16_gemv_trans( at::BFloat16* y, const int incy) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0); - return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy); + bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy); } float fp16_dot( diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index b6d8d684ae62..895263bc4466 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -41,7 +41,7 @@ class ReduceMultiply { *self_data = c10::load(self_data) && c10::load(src_data); } }; -static ReduceMultiply reduce_multiply; +ReduceMultiply reduce_multiply; class ReduceAdd { public: @@ -51,7 +51,7 @@ class ReduceAdd { *self_data += opmath_t(c10::load(src_data)); } }; -static ReduceAdd reduce_add; +ReduceAdd 
reduce_add; class ReduceMean { public: @@ -61,7 +61,7 @@ class ReduceMean { *self_data += opmath_t(c10::load(src_data)); } }; -static ReduceMean reduce_mean; +ReduceMean reduce_mean; class ReduceMaximum { public: @@ -73,7 +73,7 @@ class ReduceMaximum { *self_data = at::_isnan(src_value) ? opmath_t(src_value) : std::max(self_value, opmath_t(src_value)); } }; -static ReduceMaximum reduce_maximum; +ReduceMaximum reduce_maximum; class ReduceMinimum { public: @@ -85,7 +85,7 @@ class ReduceMinimum { *self_data = at::_isnan(src_value) ? opmath_t(src_value) : std::min(self_value, opmath_t(src_value)); } }; -static ReduceMinimum reduce_minimum; +ReduceMinimum reduce_minimum; class TensorAssign { public: @@ -95,7 +95,7 @@ class TensorAssign { *self_data = opmath_t(c10::load(src_data)); } }; -static TensorAssign tensor_assign; +TensorAssign tensor_assign; template struct _cpu_scatter_gather_dim_loop { diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index dac0f3bef25e..9ecfe55cedc4 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -17,7 +17,6 @@ #include #include #include -#include // [Note AVX-SSE transitions] In general we avoid calls into cmath for code // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in @@ -969,7 +968,7 @@ struct vec_host_softmax_backward { } }; -static void softmax_lastdim_kernel_impl( +void softmax_lastdim_kernel_impl( const Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -978,13 +977,13 @@ static void softmax_lastdim_kernel_impl( [&] { vec_host_softmax_lastdim::apply(result, self); }); } -static void softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +void softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "softmax_kernel_impl", [&] { vec_softmax::apply(result, self, dim); }); } -static void log_softmax_lastdim_kernel_impl( +void log_softmax_lastdim_kernel_impl( const Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -993,13 +992,13 @@ static void log_softmax_lastdim_kernel_impl( [&] { vec_host_softmax_lastdim::apply(result, self); }); } -static void log_softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +void log_softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "softmax_kernel_impl", [&] { vec_softmax::apply(result, self, dim); }); } -static void softmax_backward_lastdim_kernel_impl( +void softmax_backward_lastdim_kernel_impl( const Tensor& grad_input, const Tensor& grad, const Tensor& output) { @@ -1011,7 +1010,7 @@ static void softmax_backward_lastdim_kernel_impl( }); } -static void log_softmax_backward_lastdim_kernel_impl( +void log_softmax_backward_lastdim_kernel_impl( const Tensor& grad_input, const Tensor& grad, const Tensor& output) { @@ -1023,7 +1022,7 @@ static void log_softmax_backward_lastdim_kernel_impl( }); } -static void softmax_backward_kernel_impl( +void softmax_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad, const Tensor& output, @@ -1039,7 +1038,7 @@ static void softmax_backward_kernel_impl( }); } -static void log_softmax_backward_kernel_impl( +void log_softmax_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad, const Tensor& output, diff 
--git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index b7d83d85996b..7d337c119c98 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -90,7 +90,7 @@ struct KeyValueCompDesc { }; #ifdef USE_FBGEMM -static bool can_use_radix_sort(const TensorBase& values, const bool descending) { +bool can_use_radix_sort(const TensorBase& values, const bool descending) { // radix_sort can be used only for 1D data if (values.dim() != 1) return false; // radix_sort sorts in ascending order @@ -106,7 +106,7 @@ static bool can_use_radix_sort(const TensorBase& values, const bool descending) return true; } -static void parallel_sort1d_kernel( +void parallel_sort1d_kernel( const TensorBase& values, const TensorBase& indices) { AT_DISPATCH_INTEGRAL_TYPES(values.scalar_type(), "parallel_sort1d_kernel", [&] { @@ -140,7 +140,7 @@ static void parallel_sort1d_kernel( #endif template -static inline void sort_kernel_impl(const value_accessor_t& value_accessor, +inline void sort_kernel_impl(const value_accessor_t& value_accessor, const indices_accessor_t& indices_accessor, int64_t dim_size, bool descending, bool stable) { auto composite_accessor = CompositeRandomAccessorCPU< @@ -165,7 +165,7 @@ static inline void sort_kernel_impl(const value_accessor_t& value_accessor, } } -static void sort_kernel( +void sort_kernel( const TensorBase& self, const TensorBase& values, const TensorBase& indices, @@ -222,7 +222,7 @@ static void sort_kernel( ); } -static void topk_kernel( +void topk_kernel( const TensorBase &values, const TensorBase &indices, const TensorBase &self, diff --git a/aten/src/ATen/native/cpu/SumKernel.cpp b/aten/src/ATen/native/cpu/SumKernel.cpp index 32364c38ea51..0fda4ae05f3e 100644 --- a/aten/src/ATen/native/cpu/SumKernel.cpp +++ b/aten/src/ATen/native/cpu/SumKernel.cpp @@ -286,12 +286,12 @@ struct CastStoreAccumulate { }; template -static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, scalar_t value) { +void store(char * C10_RESTRICT data, int64_t stride, int64_t index, scalar_t value) { StorePolicy::store(data, stride, index, value); } template -static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +void store(char * C10_RESTRICT data, int64_t stride, int64_t index, const std::array &values) { auto *base_ptr = data + stride * index; for (const auto k : c10::irange(numel)) { @@ -301,7 +301,7 @@ static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, } template -static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +void store(char * C10_RESTRICT data, int64_t stride, int64_t index, const Vectorized &values) { using vec_t = Vectorized; alignas(64) std::array array_values{}; diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp index 2c52a61fc553..c479e1610cbe 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -29,7 +29,7 @@ namespace at::native { namespace { template -static inline void compare_base_kernel_core( +inline void compare_base_kernel_core( const Tensor& result1, const Tensor& result2, const Tensor& self, @@ -71,7 +71,7 @@ static inline void compare_base_kernel_core( } template -static inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, +inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, const Tensor& self, int64_t dim, bool keepdim, @@ -98,7 
+98,7 @@ static inline void compare_base_kernel(const Tensor& result1, const Tensor& resu result1, result2, self, dim, keepdim, loop); } -static void min_kernel_impl( +void min_kernel_impl( const Tensor& result, const Tensor& indice, const Tensor& self, @@ -131,7 +131,7 @@ static void min_kernel_impl( }); } -static void max_kernel_impl( +void max_kernel_impl( const Tensor& result, const Tensor& indice, const Tensor& self, @@ -164,7 +164,7 @@ static void max_kernel_impl( }); } -static void aminmax_kernel( +void aminmax_kernel( const Tensor& self, int64_t dim, bool keepdim, @@ -212,7 +212,7 @@ static void aminmax_kernel( }); } -static void where_kernel_impl(TensorIterator &iter) { +void where_kernel_impl(TensorIterator &iter) { AT_DISPATCH_V2( iter.dtype(), "where_cpu", [&] { cpu_kernel( @@ -224,19 +224,19 @@ static void where_kernel_impl(TensorIterator &iter) { kComplexHalf, kHalf, kBFloat16, kBool, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_FLOAT8_TYPES)); } -static void isposinf_kernel_impl(TensorIteratorBase& iter) { +void isposinf_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isposinf_cpu", [&]() { cpu_kernel(iter, [](scalar_t a) -> bool { return a == std::numeric_limits::infinity(); }); }); } -static void isneginf_kernel_impl(TensorIteratorBase& iter) { +void isneginf_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isneginf_cpu", [&]() { cpu_kernel(iter, [](scalar_t a) -> bool { return a == -std::numeric_limits::infinity(); }); }); } -static void mode_kernel_impl( +void mode_kernel_impl( Tensor& values, Tensor& indices, const Tensor& self, @@ -308,7 +308,7 @@ static void mode_kernel_impl( // Default brute force implementation of isin(). Used when the number of test elements is small. // Iterates through each element and checks it against each test element. 
-static void isin_default_kernel_cpu( +void isin_default_kernel_cpu( const Tensor& elements, const Tensor& test_elements, bool invert, @@ -339,7 +339,7 @@ static void isin_default_kernel_cpu( }); } -static void clamp_kernel_impl(TensorIteratorBase& iter) { +void clamp_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_cpu", [&]() { cpu_kernel_vec(iter, [](scalar_t a, scalar_t min, scalar_t max) -> scalar_t { @@ -355,7 +355,7 @@ static void clamp_kernel_impl(TensorIteratorBase& iter) { }); } -static void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, const Scalar& max_) { +void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, const Scalar& max_) { AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_scalar_cpu", [&]() { const auto min = min_.to(); const auto max = max_.to(); @@ -371,7 +371,7 @@ static void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min }); } -static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { +void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_max_scalar_cpu", [&]() { const auto max = max_.to(); const Vectorized max_vec(max); @@ -385,7 +385,7 @@ static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) }); } -static void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { +void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_min_scalar_cpu", [&]() { const auto min = min_.to(); const Vectorized min_vec(min); diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index 8c94decfff02..444ec10861da 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -13,7 +13,7 @@ namespace at::native { namespace { template -static inline void cadd( +inline void cadd( scalar_t* z, const scalar_t* x, const scalar_t* y, @@ -34,7 +34,7 @@ static inline void cadd( } template -static void unfolded2d_acc( +void unfolded2d_acc( scalar_t* finput_data, scalar_t* input_data, int64_t kH, @@ -113,7 +113,7 @@ static void unfolded2d_acc( } template -static void unfolded2d_acc_channels_last( +void unfolded2d_acc_channels_last( scalar_t* finput_data, scalar_t* input_data, int64_t kH, @@ -225,7 +225,7 @@ void unfolded2d_acc_kernel( } template -static void unfolded2d_copy( +void unfolded2d_copy( const scalar_t* input_data, scalar_t* finput_data, int64_t kH, @@ -240,7 +240,7 @@ static void unfolded2d_copy( int64_t output_height, int64_t output_width) { at::parallel_for( - 0, (int64_t)n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { + 0, n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { for (const auto k : c10::irange(start, end)) { int64_t nip = k / (kH * kW); int64_t rest = k % (kH * kW); @@ -316,7 +316,7 @@ static void unfolded2d_copy( for (int64_t x = 0; x < output_width; x++) memcpy( dst + (size_t)y * output_width + x, - src + (size_t)iy * input_width + ix + (int64_t)x * dW, + src + (size_t)iy * input_width + ix + x * dW, sizeof(scalar_t) * (1)); } } @@ -326,7 +326,7 @@ static void unfolded2d_copy( } template -static void unfolded2d_copy_channels_last( +void unfolded2d_copy_channels_last( const scalar_t* input_data, scalar_t* finput_data, int64_t kH, diff --git 
a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index 74fb38779ea1..e59e5985bf7f 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -157,13 +157,13 @@ struct Interpolate<1, scalar_t, opmath_t, index_t, 2> { }; template -static inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { +inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { using opmath_t = at::opmath_type; return Interpolate::eval(src, data, strides, i); } template -static inline scalar_t interpolate_aa_single_dim_zero_strides( +inline scalar_t interpolate_aa_single_dim_zero_strides( char* src, char** data, const index_t ids_stride) { @@ -187,7 +187,7 @@ static inline scalar_t interpolate_aa_single_dim_zero_strides( } template -static inline scalar_t interpolate_aa_single_dim( +inline scalar_t interpolate_aa_single_dim( char* src, char** data, const int64_t* strides, @@ -213,7 +213,7 @@ static inline scalar_t interpolate_aa_single_dim( } template -static inline bool is_zero_stride(const int64_t* strides) { +inline bool is_zero_stride(const int64_t* strides) { bool output = strides[0] == 0; for (const auto i : c10::irange(1, m)) { output &= (strides[i] == 0); @@ -222,7 +222,7 @@ static inline bool is_zero_stride(const int64_t* strides) { } template -static inline bool is_contiguous_stride(const int64_t* strides) { +inline bool is_contiguous_stride(const int64_t* strides) { bool output = (strides[0] == sizeof(index_t)) && (strides[1] == sizeof(scalar_t)); for (int i=2; i<2 * interp_size; i+=2) { output &= (strides[i] == sizeof(index_t)) && (strides[i + 1] == sizeof(scalar_t)); @@ -282,13 +282,13 @@ struct CheckAlmostAllZeroStrides<0, non_zero_stride_dim, scalar_t, index_t, inte }; template -static inline bool check_almost_all_zero_stride(const int64_t* strides) { +inline bool check_almost_all_zero_stride(const int64_t* strides) { return CheckAlmostAllZeroStrides::eval(strides); } // Helper method to compute interpolation for nearest, linear, cubic modes template -static inline void basic_loop(char** data, const int64_t* strides, int64_t n) { +inline void basic_loop(char** data, const int64_t* strides, int64_t n) { char* dst = data[0]; char* src = data[1]; for (const auto i : c10::irange(n)) { @@ -298,7 +298,7 @@ static inline void basic_loop(char** data, const int64_t* strides, int64_t n) { } template -static inline void basic_loop_aa_vertical( +inline void basic_loop_aa_vertical( char** data, const int64_t* strides, int64_t n, @@ -354,7 +354,7 @@ inline void basic_loop_aa_vertical( } template -static inline void basic_loop_aa_horizontal( +inline void basic_loop_aa_horizontal( char** data, const int64_t* strides, int64_t n, @@ -1038,7 +1038,7 @@ struct HelperInterpNearest : public HelperInterpBase { // We keep this structure for BC and consider as deprecated. 
// See HelperInterpNearestExact as replacement - static const int interp_size = 1; + static constexpr int interp_size = 1; static inline void init_indices_weights( at::ScalarType output_type, @@ -1155,7 +1155,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest { struct HelperInterpLinear : public HelperInterpBase { - static const int interp_size = 2; + static constexpr int interp_size = 2; // Compute indices and weights for each interpolated dimension // indices_weights = { @@ -1275,7 +1275,7 @@ struct HelperInterpLinear : public HelperInterpBase { struct HelperInterpCubic : public HelperInterpBase { - static const int interp_size = 4; + static constexpr int interp_size = 4; // Compute indices and weights for each interpolated dimension // indices_weights = { diff --git a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h index 5b545509b1d9..24eddb3e1310 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h +++ b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -35,7 +35,7 @@ Like PIL, Pillow is licensed under the open source HPND License namespace { -static inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { +inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { int32_t v; if (i32_aligned) { v = *(const int32_t*)ptr; @@ -45,11 +45,11 @@ static inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32 return _mm_cvtsi32_si128(v); } -static inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { +inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned)); } -static inline void _write_endline_rgb_as_uint32( +inline void _write_endline_rgb_as_uint32( uint8_t* C10_RESTRICT output, uint32_t data ) { diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp index c8e0b8e86793..676e8bebcec1 100644 --- a/aten/src/ATen/native/cpu/int4mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp @@ -838,7 +838,7 @@ void dyn_quant_pack_4bit_weight_kernel( } } -static void ref_dyn_quant_matmul_4bit_channelwise_kernel( +void ref_dyn_quant_matmul_4bit_channelwise_kernel( size_t m, size_t n, size_t k, @@ -906,7 +906,7 @@ static void ref_dyn_quant_matmul_4bit_channelwise_kernel( // Round to nearest integer const int32_t nudged_zero_point0 = lrintf(zero_point0); - int8_t* dst_ptr = (int8_t*)lhs_qa8dx + m_idx * dst_stride; + int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride; // LHS offset at the beginning of the row *((float*)(dst_ptr)) = recip_scale0; @@ -997,7 +997,7 @@ static void ref_dyn_quant_matmul_4bit_channelwise_kernel( } } -static void ref_dyn_quant_matmul_4bit_groupwise_kernel( +void ref_dyn_quant_matmul_4bit_groupwise_kernel( size_t m, size_t n, size_t k, @@ -1048,7 +1048,7 @@ static void ref_dyn_quant_matmul_4bit_groupwise_kernel( zero_point0 = (std::min)(zero_point0, qmax); const int32_t nudged_zero_point0 = lrintf(zero_point0); - int8_t* dst_ptr = (int8_t*)lhs_qa8dx + row_idx * dst_stride; + int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride; *((float*)(dst_ptr)) = recip_scale0; dst_ptr += sizeof(float); diff --git a/aten/src/ATen/native/cpu/int8mm_kernel.cpp b/aten/src/ATen/native/cpu/int8mm_kernel.cpp index 7e2cba98ff1d..496b98261964 100644 --- a/aten/src/ATen/native/cpu/int8mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int8mm_kernel.cpp @@ -100,7 +100,7 @@ inline 
void tinygemm_kernel( #elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -static inline float _mm256_reduce_add_ps(__m256& v) { +inline float _mm256_reduce_add_ps(__m256& v) { __m256 v1 = _mm256_permute2f128_ps(v, v, 0x1); v = _mm256_add_ps(v, v1); v1 = _mm256_shuffle_ps(v, v, 0x4E); diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 1dab8c19c700..68a9582a09c1 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -105,7 +106,8 @@ c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, b } } -using at::cuda::blas::ScalingType; +using at::blas::ScalingType; +using at::blas::SwizzleType; /** * @brief Prepares matrices for CUBLAS operation @@ -285,8 +287,8 @@ static bool isSupportedHipLtROCmArch(int index) { #if ROCM_VERSION >= 60300 "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908", #endif -#if ROCM_VERSION >= 60500 - "gfx950" +#if ROCM_VERSION >= 70000 + "gfx950", "gfx1150", "gfx1151" #endif }; return at::detail::getCUDAHooks().isGPUArch(archs, index); @@ -294,7 +296,7 @@ static bool isSupportedHipLtROCmArch(int index) { #endif template -static void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) { +void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) { bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); at::cuda::tunable::GemmAndBiasParams params; @@ -1112,7 +1114,7 @@ namespace{ * - Returns Error. */ -using at::cuda::blas::ScalingType; +using at::blas::ScalingType; bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) { return isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.numel() == 1; @@ -1124,6 +1126,17 @@ bool is_rowwise_scaling(const at::Tensor& t, const at::Tensor& scale) { && scale.is_contiguous()); } +bool check_size_stride(const at::Tensor& scale, int dim, int size, int stride) { + // For Blockwise1x128 and Blockwise128x128, + // when the scale tensor has a dimension of size 1, the stride is effectively + // "meaningless", i.e. PyTorch decides to use a stride of 1. Thus, the regular + // stride check fails. Here, we relax the stride check when the effective + // stride is 1. 
+ + return ( + scale.size(dim) == size && (size <= 1 || scale.stride(dim) == stride)); +} + // 1x16 blocks for packed nvfp4 data and fp8_e4m3fn scales bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) { // Multiply t.size(1) by 2 to adjust for fp4x2 packing @@ -1138,21 +1151,35 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) { bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) { // TODO: We might want to enforce some structure on the shapes of the scale // tensors - return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu - && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4) - && scale.is_contiguous()); + bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu + && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4)); + bool is_packed_fp4_path = false; +#ifdef USE_ROCM + is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu + && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1) * 2, 32), 4)); +#endif + return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous(); } bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) { - return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.dim() == 2 - && scale.size(0) == t.size(0) && scale.size(1) == ceil_div(t.size(1), 128) - && scale.stride(0) == 1 && scale.stride(1) == t.size(0)); + return ( + isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && + scale.dim() == 2 && check_size_stride(scale, 0, t.size(0), 1) && + check_size_stride( + scale, 1, ceil_div(t.size(1), 128), t.size(0))); } bool is_blockwise_128x128_scaling(const at::Tensor& t, const at::Tensor& scale) { - return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.dim() == 2 - && scale.size(0) == ceil_div(t.size(0), 128) && scale.size(1) == ceil_div(t.size(1), 128) - && scale.stride(0) == round_up(ceil_div(t.size(1), 128), 4) && scale.stride(1) == 1); + return ( + isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && + scale.dim() == 2 && + check_size_stride( + scale, + 0, + ceil_div(t.size(0), 128), + ceil_div(t.size(1), 128)) && + check_size_stride( + scale, 1, ceil_div(t.size(1), 128), 1)); } bool is_desired_scaling(const at::Tensor& t, const at::Tensor& scale, ScalingType desired_scaling) { @@ -1203,8 +1230,207 @@ std::pair get_joint_scaling( ); } +Tensor& +_tunable_scaled_gemm_rocm( + cublasCommonArgs& args, + const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, const Tensor& scale_b, + const ScalingType scaling_choice_a, const ScalingType scaling_choice_b, + const std::optional& bias, + const bool use_fast_accum, + const at::ScalarType out_dtype, + Tensor& out) { +#ifdef USE_ROCM +#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \ + if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if 
(mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2, at::Float8_e5m2, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } + AT_DISPATCH_V2(out_dtype, "_tunable_scaled_gemm", AT_WRAP([&] { + bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); + bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); + at::cuda::tunable::ScaledGemmParams params; + params.transa = args.transa; + params.transb = args.transb; + params.m = args.m; + params.n = args.n; + params.k = args.k; + params.a = args.mata->data_ptr(); + params.a_scale_ptr = args.scale_mata_ptr; + params.a_scale_dtype = args.scale_mata_dtype.value(); + params.lda = args.lda; + params.a_dtype = args.mata->scalar_type(); + params.a_scale_dtype = args.scale_mata_dtype.value(); + params.a_scaling_type = args.scaling_mata_type.value(); + params.b = args.matb->data_ptr(); + params.b_scale_ptr = args.scale_matb_ptr; + params.b_scale_dtype = args.scale_matb_dtype.value(); + params.ldb = args.ldb; + params.b_dtype = args.matb->scalar_type(); + params.b_scale_dtype = args.scale_matb_dtype.value(); + params.b_scaling_type = args.scaling_matb_type.value(); + params.bias_ptr = bias ? bias->data_ptr(): nullptr; + params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype) ? 
at::ScalarType::Half : out_dtype; + params.c = args.result->data_ptr(); + params.c_scale_ptr = args.scale_result_ptr; + params.ldc = args.result_ld; + params.c_dtype = out_dtype; + params.use_fast_accum = use_fast_accum; + if (transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T) + } + else if (transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N) + } + else if (!transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T) + } + else if (!transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N) + } + else { + TORCH_CHECK(false, "unreachable"); + } + }), + kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES)); +#undef TUNABLE_DISPATCH + return out; +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "_scaled_gemm_rocm only callable on ROCM devices"); +#endif +} + +Tensor& +_scaled_gemm( + const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, const Tensor& scale_b, + const ScalingType scaling_choice_a, const ScalingType scaling_choice_b, + const std::optional& bias, + const bool use_fast_accum, + Tensor& out, + const std::optional& alpha = std::nullopt) { + cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, std::nullopt, scaling_choice_a, scaling_choice_b); + const auto out_dtype_ = args.result->scalar_type(); + TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); + +// ROCM enables the TunableOp path only +// but can fallback to at::cuda::blas::scaled_gemm +#ifdef USE_ROCM + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + bool tunable_op_enabled = tuning_ctx->IsTunableOpEnabled(); +#else + bool tunable_op_enabled = false; +#endif + if (tunable_op_enabled) { + // Only available on ROCM + return _tunable_scaled_gemm_rocm( + args, + mat1, mat2, + scale_a, scale_b, + scaling_choice_a, scaling_choice_b, + bias, + use_fast_accum, + out_dtype_, + out); + } + else + { + at::cuda::blas::scaled_gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + args.mata->data_ptr(), + args.scale_mata_ptr, + args.lda, + args.mata->scalar_type(), + args.scale_mata_dtype.value(), + args.scaling_mata_type.value(), + args.matb->data_ptr(), + args.scale_matb_ptr, + args.ldb, + args.matb->scalar_type(), + args.scale_matb_dtype.value(), + args.scaling_matb_type.value(), + bias ? bias->data_ptr(): nullptr, + bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_, + args.result->data_ptr(), + args.scale_result_ptr, + args.result_ld, + out_dtype_, + use_fast_accum, + alpha); + return out; + } +} + } // namespace +// NOTE(slayton58): This is defined as part of the _v2 code (way) below - declare the signature here +// to help cleanup v1 call structure. +Tensor& +_scaled_rowwise_rowwise( + const Tensor&, const Tensor&, + const Tensor&, const Tensor&, + const std::optional&, + const c10::ScalarType, + bool, + Tensor&); + + // Computes matrix multiply + bias while applying scaling to input and output matrices // Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default. // If output matrix type is 16 or 32-bit type, scale_result is not applied. @@ -1246,6 +1472,10 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, // by decreasing priority. 
We prefer "simpler" schemes as they are supported // more broadly (more GPU archs, more CUDA versions) and because they are more // efficient. This tends to matter only for small matmuls (e.g., 1x1x128). + + // List of supported BlockWise pairs for FP8: + // https://docs.nvidia.com/cuda/cublas/#element-1d-and-128x128-2d-block-scaling-for-fp8-data-types + auto [scaling_choice_a, scaling_choice_b] = get_joint_scaling( { std::make_pair(ScalingType::TensorWise, ScalingType::TensorWise), @@ -1278,7 +1508,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, TORCH_CHECK(isFloat8Type(mat2.scalar_type()) || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2, "Expected mat2 to be Float8 or Float4_x2 matrix got ", mat2.scalar_type()); #ifndef USE_ROCM // Type restrictions imposed by CuBLASLt as of CUDA-12.1 - TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2, + TORCH_CHECK_VALUE(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2, "Multiplication of two Float8_e5m2 matrices is not supported"); #endif if (use_fast_accum) { @@ -1344,200 +1574,66 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, // NVIDIA's cuBLAS only started supporting row-wise scaling in version 12.9, // and only for compute capability 9.0+. In other cases we use CUTLASS. -#ifndef USE_ROCM // We are doing row-wise scaling - auto dprops = at::cuda::getCurrentDeviceProperties(); - if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise - && ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900) - // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales - || (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) { - TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); - at::cuda::detail::f8f8bf16_rowwise( - mat1, - mat2, - scale_a, - scale_b, - bias, - use_fast_accum, - out); - return out; - } -#else if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise) { +#ifndef USE_ROCM + auto dprops = at::cuda::getCurrentDeviceProperties(); + if ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900) + // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales + || (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty()))) { + TORCH_CHECK_VALUE(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); + return _scaled_rowwise_rowwise( + mat1, + mat2, + scale_a, + scale_b, + bias, + out.scalar_type(), + use_fast_accum, + out); + } +#else // For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes. Tensor b = mat2; if (_scaled_mm_is_fnuz()) { - TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fnuz); + TORCH_CHECK_VALUE(b.dtype() == at::kFloat8_e4m3fnuz, + "Expected b.dtype() == at::kFloat8_e4m3fnuz, got: ", b.dtype()); } else { - TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn); + TORCH_CHECK_VALUE(b.dtype() == at::kFloat8_e4m3fn, + "Expected b.dtype() == at::kFloat8_e4m3fn, got: ", b.dtype()); } // Until more than bf16 is supported. 
- TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16, + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16, "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type()); +#endif } else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) { +#ifdef USE_ROCM #if ROCM_VERSION >= 70000 - TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), + TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); - TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && - mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, - "Matrix dimensions must be multiples of 32 for block-wise scaling"); + int packed_factor = 1; + if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) { + // For float4 data type, each byte stores two 4-bit floating-point values, + // effectively packing two elements into one byte. + packed_factor = 2; + } + TORCH_CHECK_VALUE(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 && + mat2.size(1) % 16 == 0, + "M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling"); - TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 || out.scalar_type() == ScalarType::Half, "Block-wise scaling only supports BFloat16 or Half output types"); #else - TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); -#endif - } + TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); #endif - - cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result, scaling_choice_a, scaling_choice_b); - const auto out_dtype_ = args.result->scalar_type(); - TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); - -#ifdef USE_ROCM - auto tuning_ctx = at::cuda::tunable::getTuningContext(); - if (tuning_ctx->IsTunableOpEnabled()) { -#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \ - if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ - if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - } \ - else if (mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ - if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - } \ - else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \ - if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - else if (mat2.scalar_type() == 
ScalarType::Float8_e5m2) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - } \ - else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \ - if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e5m2, at::Float8_e5m2, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - } - AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] { - bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); - bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); - at::cuda::tunable::ScaledGemmParams params; - params.transa = args.transa; - params.transb = args.transb; - params.m = args.m; - params.n = args.n; - params.k = args.k; - params.a = args.mata->data_ptr(); - params.a_scale_ptr = args.scale_mata_ptr; - params.a_scale_dtype = args.scale_mata_dtype.value(); - params.lda = args.lda; - params.a_dtype = args.mata->scalar_type(); - params.a_scale_dtype = args.scale_mata_dtype.value(); - params.a_scaling_type = args.scaling_mata_type.value(); - params.b = args.matb->data_ptr(); - params.b_scale_ptr = args.scale_matb_ptr; - params.b_scale_dtype = args.scale_matb_dtype.value(); - params.ldb = args.ldb; - params.b_dtype = args.matb->scalar_type(); - params.b_scale_dtype = args.scale_matb_dtype.value(); - params.b_scaling_type = args.scaling_matb_type.value(); - params.bias_ptr = bias ? bias->data_ptr(): nullptr; - params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_; - params.c = args.result->data_ptr(); - params.c_scale_ptr = args.scale_result_ptr; - params.ldc = args.result_ld; - params.c_dtype = out_dtype_; - params.use_fast_accum = use_fast_accum; - if (transa_ && transb_) { - TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T) - } - else if (transa_ && !transb_) { - TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N) - } - else if (!transa_ && transb_) { - TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T) - } - else if (!transa_ && !transb_) { - TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N) - } - else { - TORCH_CHECK(false, "unreachable"); - } - }), - kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES)); -#undef TUNABLE_DISPATCH - } - else #endif - { - at::cuda::blas::scaled_gemm( - args.transa, - args.transb, - args.m, - args.n, - args.k, - args.mata->data_ptr(), - args.scale_mata_ptr, - args.lda, - args.mata->scalar_type(), - args.scale_mata_dtype.value(), - args.scaling_mata_type.value(), - args.matb->data_ptr(), - args.scale_matb_ptr, - args.ldb, - args.matb->scalar_type(), - args.scale_matb_dtype.value(), - args.scaling_matb_type.value(), - bias ? bias->data_ptr(): nullptr, - bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? 
at::ScalarType::Half : out_dtype_, - args.result->data_ptr(), - args.scale_result_ptr, - args.result_ld, - out_dtype_, - use_fast_accum); } - return out; + return _scaled_gemm(mat1, mat2, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); } namespace { @@ -1648,197 +1744,1330 @@ _scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, bool use_fast_accum) { const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + return _scaled_mm_out_cuda(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); } +/** + * Track concrete implementations available + */ +enum class ScaledGemmImplementation { + NONE = 0, + TENSORWISE_TENSORWISE = 1, + ROWWISE_ROWWISE = 2, + BLOCK_128x128_1x128 = 3, + BLOCK_1x128_128x128 = 4, + BLOCK_1x128_1x128 = 5, + MXFP8_MXFP8 = 6, + NVFP4_NVFP4 = 7, + NVFP4_NVFP4_SINGLE_SCALE = 8, + MXFP4_MXFP4 = 9, +}; -Tensor -_scaled_grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, -const Tensor& scale_a, const Tensor& scale_b, -const std::optional& offs, -const std::optional& bias, -const std::optional& scale_result, -std::optional out_dtype, -bool use_fast_accum) { - bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); - TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); +/** + * Convert passed int (enum) from python back into a + * strictly-typed enum + */ +template +std::vector convert_int_to_enum(ArrayType& v) { + std::vector converted; + converted.reserve(v.size()); - TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); - TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); - TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); - TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (!a_is_2d || !b_is_2d) { - TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + for (auto vi : v) { + converted.push_back(static_cast(vi)); } - TORCH_CHECK( - mat_a.size(-1) % 16 == 0, - "Expected trailing dimension of mat_a to be divisible by 16 ", - "but got mat1 shape: (", - mat_a.sizes(), - ")."); - TORCH_CHECK(mat_b.size(-2) % 16 == 0 && mat_b.size(-1) % 16 == 0, - "Expected mat_b shape to be divisible by 16 ", - "but got mat_b shape: (", - mat_b.sizes(), - ")."); - + return converted; +} - TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); - TORCH_CHECK(!scale_result.has_value(), "Scale result not supported yet"); - TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix"); +/** + * Both inputs must be fp8, + * Each needs a single scale, {Tensorwise (float)} + */ +bool check_tensorwise_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) { + return false; + } - if (offs.has_value()) { - TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); - TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + // 1 scale each, {Tensorwise, float} + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || 
recipe_b.size() != 1) { + return false; } + // Need {TensorWise, float} for A & B + if (recipe_a[0] != ScalingType::TensorWise) return false; + if (scales_a[0].scalar_type() != ScalarType::Float) return false; + if (recipe_b[0] != ScalingType::TensorWise) return false; + if (scales_b[0].scalar_type() != ScalarType::Float) return false; - // FP8 per-tensor and per-row scaling expect fp32 scales. - // MXFP8 expects float8_e8m0fnu scales. - TORCH_CHECK( - (scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) || - (scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu), - "For FP8 tensorwise and rowwise, both scales must both be float32 tensors. For MXFP8, scales must both be float8_e8m0fnu tensors."); + return true; +} - const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? offs->size(0) : 1; - check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); - check_scale(mat_b, scale_b, 1, 1, scale_multiplier); +/** + * Both inputs must be fp8, + * Each needs scales, {Rowwise (float)} + */ +bool check_rowwise_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) { + return false; + } - const auto out_dtype_ = out_dtype.value_or(kBFloat16); - TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + // 1 scale each, {RowWise, float} + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; + } - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + // Need {RowWise, fp32} for A & B + if (recipe_a[0] != ScalingType::RowWise) return false; + if (scales_a[0].scalar_type() != ScalarType::Float) return false; + if (recipe_b[0] != ScalingType::RowWise) return false; + if (scales_b[0].scalar_type() != ScalarType::Float) return false; -#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM) - // MXFP8 grouped GEMM dispatching - bool is_mx8mx8bf16 = ( - mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn && - scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu - ); - TORCH_CHECK(out_dtype == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm"); + return true; +} - if (is_mx8mx8bf16) { - bool b_is_3d = mat_b.dim() == 3; - bool is_2d_2d = a_is_2d && b_is_2d; - bool is_2d_3d = a_is_2d && b_is_3d; - TORCH_CHECK(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases"); - TORCH_CHECK(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs requires offsets"); - fbgemm_gpu::mx8mx8bf16_grouped_mm( - mat_a, - mat_b, - scale_a, - scale_b, - offs.value(), - out); - return out; +/** + * Two-level scaling, canonical NVFP4 + * Both inputs must be fp4 + * A, B need 2 scales, {Blockwise_1x16 (e4m3), Tensorwise (fp32)} + */ +bool check_nvfp4_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp4 + if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) { + return false; } -#endif - -#ifndef USE_ROCM - TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); - 
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type()); - - at::cuda::detail::f8f8bf16_grouped_mm( - mat_a, - mat_b, - scale_a, - scale_b, - offs, - bias, - use_fast_accum, - out); - return out; -#else -#ifdef USE_FBGEMM_GENAI - TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_a.scalar_type()); - TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_b.scalar_type()); - fbgemm_gpu::f8f8bf16_rowwise_grouped_mm( - mat_a, - // FBGEMM expects B matrix shape to be (.., N, K) - mat_b.transpose(-2, -1), - scale_a, - scale_b, - offs, - out); - return out; -#else - TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM") -#endif + // 2 scales, 2 recipes for each input + if (scales_a.size() != 2 || recipe_a.size() != 2 || scales_b.size() != 2 || recipe_b.size() != 2) { + return false; + } -#endif + // Need {Blockwise_1x16, e4m3 for scale[0], Tensorwise, fp32 for scale[1]} + if (recipe_a[0] != ScalingType::BlockWise1x16 || recipe_a[1] != ScalingType::TensorWise) return false; + if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_a[1].scalar_type() != ScalarType::Float) return false; + if (recipe_b[0] != ScalingType::BlockWise1x16 || recipe_b[1] != ScalingType::TensorWise) return false; + if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_b[1].scalar_type() != ScalarType::Float) return false; + return true; } -Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, -const std::optional& offs, -const std::optional& bias, -std::optional out_dtype) { - _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); - bool a_b_and_out_are_bf16 = ( - mat_a.dtype() == at::kBFloat16 && - mat_b.dtype() == at::kBFloat16 && - out_dtype.value_or(at::kBFloat16) == at::kBFloat16 - ); -#ifndef USE_ROCM - bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; -#else - // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. 
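For context on the relaxed stride validation that check_size_stride() introduces earlier in this file's hunk: when a scale dimension has size <= 1 its stride carries no information, so only the size is compared. Below is a minimal standalone sketch of that rule, not part of the patch; the FakeScale struct, relaxed_size_stride_check name, and the test values are invented for illustration.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the size/stride metadata of a scale tensor.
struct FakeScale {
  std::vector<int64_t> sizes;
  std::vector<int64_t> strides;
};

// Same rule as check_size_stride(): accept any stride when the dimension is
// degenerate (size <= 1), otherwise require an exact stride match.
bool relaxed_size_stride_check(const FakeScale& s, int dim, int64_t size, int64_t stride) {
  return s.sizes[dim] == size && (size <= 1 || s.strides[dim] == stride);
}

int main() {
  // A [1, 4] scale stored contiguously reports stride(0) == 4, yet a check
  // expecting stride(0) == 1 should still pass because size(0) == 1 makes
  // the stride meaningless.
  FakeScale scale{{1, 4}, {4, 1}};
  assert(relaxed_size_stride_check(scale, /*dim=*/0, /*size=*/1, /*stride=*/1));
  assert(relaxed_size_stride_check(scale, /*dim=*/1, /*size=*/4, /*stride=*/1));
  return 0;
}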
- // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm - bool use_fast_path = false; -#endif - const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); - if (use_fast_path) { - // fast path, no d2h sync needed - at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); - } else { - _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); +/** + * Single-level scaling, what PyT currently understands + * Both inputs must be fp4 + * A, B need 1 scale, {Blockwise_1x16 (e4m3)} + */ +bool check_nvfp4_recipe_single_scale + (c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp4 + if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) { + return false; } - return out; -} -Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { - IntArrayRef batch1_sizes = batch1.sizes(); - IntArrayRef batch2_sizes = batch2.sizes(); + // 2 scales, 2 recipes for each input + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; + } - Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype)); - return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out); -} + // Need {Blockwise_1x16, e4m3 for scale[0], Tensorwise, fp32 for scale[1]} + if (recipe_a[0] != ScalingType::BlockWise1x16) return false; + if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn) return false; + if (recipe_b[0] != ScalingType::BlockWise1x16) return false; + if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn) return false; -Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) { - TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + return true; +} - TORCH_CHECK(out_dtype == batch1.scalar_type() || - (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), - "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); +/** + * Both inputs must be fp8 + * A, B must only have 1 scale each, A: {Blockwise_1x128 (float), B: {Blockwise_128x128 (float) + */ +bool check_deepseek_recipe(ScalingType expected_recipe_a, + ScalingType expected_recipe_b, + c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) { + return false; + } - Scalar beta(0.0); - Scalar alpha(1.0); - { - NoNamesGuard guard; - baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); + // 1 scales, 1 recipes for each input + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; } - return out; + // Need {Blockwise_1x128, float} for A, {Blockwise_128x128, float} for B + if (recipe_a[0] != expected_recipe_a) return false; + if (scales_a[0].scalar_type() != ScalarType::Float) return false; + if (recipe_b[0] != expected_recipe_b) return false; + if (scales_b[0].scalar_type() != ScalarType::Float) return false; + + return true; } 
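The check_deepseek_recipe helper above is deliberately parameterized by the expected ScalingType pair so that one function can cover the 1x128/128x128, 128x128/1x128, and 1x128/1x128 combinations; scale_kernel_dispatch below then fixes those parameters with std::bind so every entry exposes the same acceptance_fn signature. A self-contained sketch of that pattern follows; the Recipe enum and check_pair function are invented for this example and are not part of the patch.

#include <cstdio>
#include <functional>
#include <vector>

// Invented enum standing in for at::blas::ScalingType in this example.
enum class Recipe { TensorWise, RowWise, BlockWise1x128, BlockWise128x128 };

// Parameterized checker: the expected recipe pair is supplied up front,
// the per-call recipe lists are validated against it.
bool check_pair(Recipe expected_a, Recipe expected_b,
                const std::vector<Recipe>& recipe_a,
                const std::vector<Recipe>& recipe_b) {
  return recipe_a.size() == 1 && recipe_b.size() == 1 &&
         recipe_a[0] == expected_a && recipe_b[0] == expected_b;
}

int main() {
  using namespace std::placeholders;
  // Bind the expected pair so the callable matches a uniform two-argument
  // signature, the same trick scale_kernel_dispatch uses with acceptance_fn.
  std::function<bool(const std::vector<Recipe>&, const std::vector<Recipe>&)>
      accept_1x128_128x128 =
          std::bind(check_pair, Recipe::BlockWise1x128, Recipe::BlockWise128x128, _1, _2);

  std::vector<Recipe> a{Recipe::BlockWise1x128};
  std::vector<Recipe> b{Recipe::BlockWise128x128};
  std::printf("accepted: %s\n", accept_1x128_128x128(a, b) ? "yes" : "no");  // prints "yes"
  return 0;
}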
-Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { - // We need to copy the tensor - Tensor out = self.clone().to(self.options().dtype(out_dtype)); +/** + * Both inputs must be fp8 + * A, B must have 1 scale each, {Blockwise_1x32, e8m0} + */ +bool check_mxfp8_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) { + return false; + } - return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out); -} + // 1 scales, 1 recipes for each input + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; + } -Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { - TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + // Need {Blockwise_1x32, e8m0} for A & B + if (recipe_a[0] != ScalingType::BlockWise1x32) return false; + if (scales_a[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false; + if (recipe_b[0] != ScalingType::BlockWise1x32) return false; + if (scales_b[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false; - TORCH_CHECK(out_dtype == batch1.scalar_type() || - (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), + return true; +} + +/** + * Both inputs must be fp4 + * A, B must have 1 scale each, {Blockwise_1x32, e8m0} + */ +bool check_mxfp4_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp4 + if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) { + return false; + } + + // 1 scales, 1 recipes for each input + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; + } + + // Need {Blockwise_1x32, e8m0} for A & B + if (recipe_a[0] != ScalingType::BlockWise1x32) return false; + if (scales_a[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false; + if (recipe_b[0] != ScalingType::BlockWise1x32) return false; + if (scales_b[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false; + + return true; +} + +using acceptance_fn = std::function&, ArrayRef&, c10::ScalarType, std::vector&, ArrayRef&)>; +using namespace std::placeholders; + +std::array, 9> scale_kernel_dispatch = {{ + { "tensorwise_tensorwise", check_tensorwise_recipe, ScaledGemmImplementation::TENSORWISE_TENSORWISE }, + { "rowwise_rowwise", check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE}, + { "block_1x128_128x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise128x128, _1, _2, _3, _4, _5, _6), + ScaledGemmImplementation::BLOCK_1x128_128x128}, + { "block_128x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise128x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, _5, _6), + ScaledGemmImplementation::BLOCK_128x128_1x128}, + { "block_1x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, 
_5, _6), + ScaledGemmImplementation::BLOCK_1x128_1x128}, + { "nvfp4_nvfp4", check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}, + { "nvfp4_nvfp4_single_scale", check_nvfp4_recipe_single_scale, ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE }, + { "mxfp8_mxfp8", check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}, + { "mxfp4_mxfp4", check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4}}}; + +Tensor& +_scaled_tensorwise_tensorwise( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32 + // + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.numel() == 1 && scale_a.scalar_type() == kFloat, "scale_a must have 1 Float element") + TORCH_CHECK_VALUE(scale_b.numel() == 1 && scale_b.scalar_type() == kFloat, "scale_b must have 1 Float element") + + auto scaling_choice_a = ScalingType::TensorWise; + auto scaling_choice_b = ScalingType::TensorWise; + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + + +Tensor& +_scaled_rowwise_rowwise( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, shape M/N for A/B + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1, "scale_a must have shape [", mat_a.size(0), ", 1], got [", scale_a.sizes(), "]"); + TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel()) + TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel()) + + TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)); + TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1)); + + auto scaling_choice_a = ScalingType::RowWise; + auto scaling_choice_b = ScalingType::RowWise; + // + // NVIDIA's cuBLAS only started supporting row-wise scaling in version 12.9, + // and only for compute capability 9.0+. In other cases we use CUTLASS. +#ifndef USE_ROCM + // We are doing row-wise scaling + auto dprops = at::cuda::getCurrentDeviceProperties(); + if (((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900) + // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales + || (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) { + TORCH_CHECK_VALUE(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); + at::cuda::detail::f8f8bf16_rowwise( + mat_a, + mat_b, + scale_a, + scale_b, + bias, + use_fast_accum, + out); + return out; + } +#else + + // For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes. 
+ //Tensor b = mat_b; + if (_scaled_mm_is_fnuz()) { + TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fnuz, "expected mat_b.dtype() to be at::kFloat8_e4m3fnuz, but got ", mat_b.dtype()); + } + else { + TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fn, "expected mat_b.dtype() to be at::kFloat8_e4m3fn, but got ", mat_b.dtype()); + } + // Until more than bf16 is supported. + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16, + "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type()); +#endif + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + +// Check the shapes & sizes of scales for deepseek-style (1x128, 128x128) scaling. +// Wraps check_size_stride for easier integration, correctly handles cases where a dimension of the scale == 1, +// and strides become somewhat meaningless +void _check_deepseek_scale_stride(const Tensor& scale, const Tensor& t, const ScalingType scale_type) { + if (scale_type == ScalingType::BlockWise1x128) { + TORCH_CHECK_VALUE(check_size_stride(scale, 0, t.size(0), 1), + "at dim=0 scale should have ", t.size(0), "elements and stride(0) ", 1, "if ", t.size(0), " > 1 - Got: ", + "shape=", scale.sizes(), ", stride=", scale.strides()); + auto expected_size = ceil_div(t.size(1), 128); + TORCH_CHECK_VALUE(check_size_stride(scale, 1, expected_size, t.size(0)), + "at dim=1 scale should have ", expected_size, "elements and stride ", t.size(0), "if ", expected_size, " > 1 - Got: ", + "shape=", scale.sizes(), ", stride=", scale.strides()); + } else if (scale_type == ScalingType::BlockWise128x128) { + TORCH_CHECK_VALUE(check_size_stride( + scale, + 0, + ceil_div(t.size(0), 128), + ceil_div(t.size(1), 128)), + "at dim=0 scale should have ", ceil_div(t.size(0), 128), "elements and stride(0) ", ceil_div(t.size(1), 128), "if ", ceil_div(t.size(0), 128), " > 1 - Got: ", + "shape=", scale.sizes(), ", stride=", scale.strides()); + TORCH_CHECK(check_size_stride( + scale, 1, ceil_div(t.size(1), 128), 1), + "at dim=1 scale should have ", ceil_div(t.size(1), 128), "elements and stride(1) ", 1, "if ", ceil_div(t.size(1), 128), " > 1 - Got: ", + "shape=", scale.sizes(), ", stride=", scale.strides()); + } +} + +Tensor& +_scaled_block1x128_block1x128( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + const bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, shape K//128 + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat, + "scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes()) + TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat, + "scale_b must have shape ", ceil_div(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes()) + + auto scaling_choice_a = ScalingType::BlockWise1x128; + auto scaling_choice_b = ScalingType::BlockWise1x128; + + // Check scale strides (including stride=1 small cases) + _check_deepseek_scale_stride(scale_a, mat_a, scaling_choice_a); + 
_check_deepseek_scale_stride(scale_b.t(), mat_b.t(), scaling_choice_b); + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + +Tensor& +_scaled_block128x128_block1x128( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional<Tensor>& bias, + const c10::ScalarType out_dtype, + const bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, shape K//128 + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat, + "scale_a must have shape ", ceil_div(mat_a.sizes()[0], 128), " x ", ceil_div(mat_a.sizes()[1], 128), " Float elements, got ", scale_a.sizes()) + TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat, + "scale_b must have shape ", ceil_div(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes()) + + auto scaling_choice_a = ScalingType::BlockWise128x128; + auto scaling_choice_b = ScalingType::BlockWise1x128; + + // Check scale strides (including stride=1 small cases) + _check_deepseek_scale_stride(scale_a, mat_a, scaling_choice_a); + _check_deepseek_scale_stride(scale_b.t(), mat_b.t(), scaling_choice_b); + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + +Tensor& +_scaled_block1x128_block128x128( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional<Tensor>& bias, + const c10::ScalarType out_dtype, + const bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128 + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat, + "scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes()) + TORCH_CHECK_VALUE(scale_b.sizes()[0] == mat_b.sizes()[0] / 128 && scale_b.sizes()[1] == mat_b.sizes()[1] / 128 && scale_b.scalar_type() == kFloat, + "scale_b must have shape ", mat_b.sizes()[0] / 128, " x ", mat_b.sizes()[1] / 128, " Float elements, got ", scale_b.sizes()) + + auto scaling_choice_a = ScalingType::BlockWise1x128; + auto scaling_choice_b = ScalingType::BlockWise128x128; + + // Check scale strides (including stride=1 small cases) + _check_deepseek_scale_stride(scale_a, mat_a, scaling_choice_a); + _check_deepseek_scale_stride(scale_b.t(), mat_b.t(), scaling_choice_b); + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + +Tensor& +_scaled_mxfp8_mxfp8( + const Tensor& mat_a, const Tensor& mat_b, + const
Tensor& scale_a, const SwizzleType swizzle_a, + const Tensor& scale_b, const SwizzleType swizzle_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are e8m0, A: shape K//32, B: K, N//32 + // Scales must be swizzled + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + +#ifdef USE_ROCM + auto scale_a_elems = ceil_div(mat_a.size(0), 32) * mat_a.size(1); + auto scale_b_elems = ceil_div(mat_b.size(1), 32) * mat_b.size(0); +#else + auto scale_a_elems = round_up(mat_a.size(0), 128) * round_up(ceil_div(mat_a.size(1), 32), 4); + auto scale_b_elems = round_up(mat_b.size(1), 128) * round_up(ceil_div(mat_b.size(0), 32), 4); +#endif + TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(), + "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel()); + TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(), + "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel()); + +#ifndef USE_ROCM + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format"); +#endif + + TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(), + "For Blockwise scaling both scales should be contiguous"); + + TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype); + + auto scaling_choice_a = ScalingType::BlockWise1x32; + auto scaling_choice_b = ScalingType::BlockWise1x32; + +#ifdef USE_ROCM +#if ROCM_VERSION >= 70000 + TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 && + mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif +#endif + + return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); +} + + +Tensor& +_scaled_mxfp4_mxfp4( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const SwizzleType swizzle_a, + const Tensor& scale_b, const SwizzleType swizzle_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + Tensor& out) { +#ifndef USE_ROCM + TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM only"); +#endif + // Restrictions: + // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 + TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + + auto scale_a_elems = ceil_div(2 * mat_a.size(0), 32) * mat_a.size(1); + auto scale_b_elems = ceil_div(2 * mat_b.size(1), 32) * mat_b.size(0); + TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(), + "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, 
got: ", scale_a.numel()); + TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(), + "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel()); + + TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(), + "For Blockwise scaling both scales should be contiguous"); + + TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype); + + auto scaling_choice_a = ScalingType::BlockWise1x32; + auto scaling_choice_b = ScalingType::BlockWise1x32; + +#if ROCM_VERSION >= 70000 + TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 && + mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif + + return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); +} + +Tensor& +_scaled_nvfp4_nvfp4( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const SwizzleType swizzle_a, + const Tensor& scale_b, const SwizzleType swizzle_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + Tensor& out, + const std::optional& global_scale_a = std::nullopt, + const std::optional& global_scale_b = std::nullopt) { +#ifdef USE_ROCM + TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM"); +#endif + std::optional alpha = std::nullopt; + // Note: "Or" here means that if only one scale is passed, we check for the other. Otherwise, + // if this is "And" we would silently do nothing in the case where one global scale is + // passed and not the other. + if (global_scale_a.has_value() || global_scale_b.has_value()) { + TORCH_CHECK_VALUE(global_scale_a.has_value(), + "For two-level-scaled NVFP4, global_scale_a must have a value"); + TORCH_CHECK_VALUE(global_scale_b.has_value(), + "For two-level-scaled NVFP4, global_scale_b must have a value"); + alpha = global_scale_a.value().mul(global_scale_b.value()); + } + // Restrictions: + // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 + // Scales must be swizzled + TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + // Note: fp4x2 format, need to double the K dimension for checking purposes. 
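A quick illustrative sketch of that doubling (plain Python, shapes assumed, not code from this patch), mirroring the round_up/ceil_div arithmetic in the checks that follow:

def ceil_div(a, b): return -(-a // b)
def round_up(a, b): return ceil_div(a, b) * b

M, K_packed, N = 128, 64, 256     # K_packed counts fp4x2 pairs, so the logical K is 128
scale_a_elems = round_up(M, 128) * round_up(ceil_div(K_packed * 2, 16), 4)
scale_b_elems = round_up(N, 128) * round_up(ceil_div(K_packed * 2, 16), 4)
print(scale_a_elems, scale_b_elems)   # 1024 and 2048 for these shapes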
+ auto scale_a_elems = round_up(mat_a.size(0), 128) * round_up(ceil_div(mat_a.size(1) * 2, 16), 4); + auto scale_b_elems = round_up(mat_b.size(1), 128) * round_up(ceil_div(mat_b.size(0) * 2, 16), 4); + TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(), + "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel()); + TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(), + "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel()); + + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format"); + + TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(), + "For Blockwise scaling both scales should be contiguous"); + + auto scaling_choice_a = ScalingType::BlockWise1x16; + auto scaling_choice_b = ScalingType::BlockWise1x16; + return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out, alpha); +} + + +// V2: Computes matrix multiply + bias while applying scaling to input and output matrices +// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default. +// If output matrix type is 16 or 32-bit type, scale_result is not applied. +// Known limitations: +// - Only works if mat1 is row-major and mat2 is column-major +// - Only works if matrices sizes are divisible by 32 +// - If 1-dimensional tensors are used then scale_a should be size = mat1.size(0) +// and scale_b should have size = to mat2.size(1) +// Arguments: +// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `scale_a`: a tensor with the inverse scale of `mat1`, whose shape/strides/dtype depend on the scaling scheme +// - `scale_recipe_a`: An integer corresponding to an enum describing the scaling scheme used for `scale_a` +// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_a` +// - `scale_b`: a tensor with the inverse scale of `mat2`, whose shape/strides/dtype depend on the scaling scheme +// - `scale_recipe_b`: An integer corresponding to an enum describing the scaling scheme used for `scale_b` +// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_b` +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type +// - `use_fast_accum`: if true, enables fast float8 accumulation. Backends may ignore this option if not applicable. 
+// - `out`: a reference to the output tensor +Tensor& +_scaled_mm_cuda_v2_out( + const Tensor& mat_a, const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum, + Tensor& out) { + // Check sizes + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK_NOT_IMPLEMENTED(allowed_device, + "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); + TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix"); + TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix"); + + // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm kernels + // do not support this case). + if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) { + // `out` was created with `at::empty`. In the case where we are multiplying + // MxK by KxN and K is the zero dim, we need to initialize here to properly + // return a tensor of zeros. + at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)}); + if (mat_a.size(1) == 0) { + out.zero_(); + } + + return out; + } + + // Check if the input matrix sizes can be multiplied + // - if optional contraction dims are provided, use those + // -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not available. + if (contraction_dim.size() > 0) { + TORCH_CHECK_VALUE(contraction_dim.size() == 2, "contraction_dim must have exactly 2 elements"); + auto mat_a_dim = contraction_dim[0]; + auto mat_b_dim = contraction_dim[1]; + TORCH_CHECK_VALUE( + mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim), "mat_a and mat_b shapes cannot be multiplied (", + mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ") ", + "with contraction dims mat_a: ", mat_a_dim, ", mat_b: ", mat_b_dim); + } else { + TORCH_CHECK_VALUE( + mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied (", + mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")"); + } + + TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1], + " but got ", bias->numel()); + TORCH_CHECK_VALUE( + mat_a.sizes()[1] % 16 == 0, + "Expected trailing dimension of mat1 to be divisible by 16 ", + "but got mat1 shape: (", + mat_a.sizes()[0], + "x", + mat_a.sizes()[1], + ")."); + TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x", + mat_b.sizes()[1], ") must be divisible by 16"); + + // TODO(slayton): Existing checks, not sure if they should really be here. 
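The contraction-dimension handling above reduces to a simple rule, sketched here in plain Python (illustrative only, not code from this patch):

def contraction_ok(shape_a, shape_b, contraction_dim=None):
    # With an explicit (dim_a, dim_b) pair, compare those sizes; otherwise
    # require the usual inner dimensions of a 2D matmul to agree.
    if contraction_dim:
        dim_a, dim_b = contraction_dim
        return shape_a[dim_a] == shape_b[dim_b]
    return shape_a[1] == shape_b[0]

assert contraction_ok((16, 32), (32, 64))                            # plain K-vs-K match
assert contraction_ok((16, 32), (64, 32), contraction_dim=(-1, -1))  # cheap-transpose-free case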
+ TORCH_CHECK_VALUE(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type"); + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) || mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2, + "Expected mat_a to be Float8 or Float4_x2 matrix got ", mat_a.scalar_type()); + TORCH_CHECK_VALUE(isFloat8Type(mat_b.scalar_type()) || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2, + "Expected mat_b to be Float8 or Float4_x2 matrix got ", mat_b.scalar_type()); +#ifndef USE_ROCM + // Type restrictions imposed by CuBLASLt as of CUDA-12.1 + TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float8_e5m2 || mat_b.scalar_type() != ScalarType::Float8_e5m2, + "Multiplication of two Float8_e5m2 matrices is not supported"); +#endif + if (use_fast_accum) { + TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat_b.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat_a` or `mat_b` tensors have the `Float4_e2m1fn_x2` dtype."); + } +#ifdef USE_ROCM + if (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2) { + TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 70000, + "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above"); + } + if (mat_a.scalar_type() == ScalarType::Float8_e5m2 || mat_b.scalar_type() == ScalarType::Float8_e5m2) { + TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500, + "Float8_e5m2 is only supported for ROCm 6.5 and above"); + } + if (mat_a.scalar_type() == ScalarType::Float8_e4m3fn || mat_b.scalar_type() == ScalarType::Float8_e4m3fn) { + TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500, + "Float8_e4m3fn is only supported for ROCm 6.5 and above"); + } +#endif + if (bias) { + TORCH_CHECK_VALUE(out.scalar_type() != kFloat, + "Bias is not supported when out_dtype is set to Float32"); + + TORCH_CHECK_VALUE(bias->scalar_type() == ScalarType::BFloat16 || + bias->scalar_type() == ScalarType::Half, + "Bias must be BFloat16 or Half, but got ", bias->scalar_type()); + + TORCH_CHECK_VALUE((out.scalar_type() != kFloat && + out.scalar_type() != ScalarType::BFloat16) || + bias->scalar_type() == ScalarType::BFloat16, + "Bias must be BFloat16 to compute ", out.scalar_type(), + " output, but got ", bias->scalar_type()); + + TORCH_CHECK_VALUE(out.scalar_type() != ScalarType::Half || + bias->scalar_type() == ScalarType::Half, + "Bias must be Float16 to compute ", out.scalar_type(), + " output, but got ", bias->scalar_type()); + } + { + auto bias_ = bias.value_or(Tensor()); + + // NOLINTNEXTLINE(*c-array*) + TensorArg targs[]{{out, "out", 0}, {mat_a, "mat_a", 1}, {mat_b, "mat_b", 2}, + {bias_, "bias", 3}, {scale_a[0], "scale_a", 4}, {scale_b[0], "scale_b", 5}}; + checkAllSameGPU(__func__, targs); + } + + auto out_dtype_ = out_dtype.value_or(at::ScalarType::BFloat16); + + // Conversion of implicitly-defined enums to explicit + auto scale_recipe_a_enum = convert_int_to_enum(scale_recipe_a); + auto swizzle_a_enum = convert_int_to_enum(swizzle_a); + auto scale_recipe_b_enum = convert_int_to_enum(scale_recipe_b); + auto swizzle_b_enum = convert_int_to_enum(swizzle_b); + + // at this point we can start working out what we want to be doing + // Try to do as few steps as possible. + // NOTE: support is deliberately sparse, can explicitly enumerate all combinations allowed. + // Do this via a list of defined (name, acceptance, concrete_impl) tuples. 
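Conceptually, the dispatch below behaves like this small Python sketch (illustrative only, not code from this patch): walk an ordered table of (name, acceptance function, implementation) entries and take the first entry whose acceptance function returns true.

RECIPES = [
    ("tensorwise_tensorwise", lambda sa, sb: sa == "scalar" and sb == "scalar", "TENSORWISE_TENSORWISE"),
    ("rowwise_rowwise", lambda sa, sb: sa == "per_row" and sb == "per_col", "ROWWISE_ROWWISE"),
]

def pick_impl(scale_kind_a, scale_kind_b):
    for name, accepts, impl in RECIPES:
        if accepts(scale_kind_a, scale_kind_b):
            return impl
    raise ValueError("invalid scaling configuration")

assert pick_impl("per_row", "per_col") == "ROWWISE_ROWWISE"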
+ bool found_impl = false; + ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE; + + for (const auto& fn_entry : scale_kernel_dispatch) { + const auto [name, accept_fn, scaled_gemm_impl] = fn_entry; + bool ok = accept_fn(mat_a.scalar_type(), + scale_recipe_a_enum, + scale_a, + mat_b.scalar_type(), + scale_recipe_b_enum, + scale_b); + if (ok) { + gemm_impl = scaled_gemm_impl; + found_impl = true; + break; + } + } + TORCH_CHECK_VALUE( + found_impl, + "Invalid scaling configuration.\n" + "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n" + "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", 1) and scale_b should be (1, ", mat_b.size(1), "), and both should be contiguous.\n" + "- For BlockWise 1x128 scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", ", ceil_div(mat_a.size(1), 128), ") and scale_b should be (", ceil_div(mat_b.size(0), 128), ", ", mat_b.size(1), "), and both should be outer-dim-major.\n" + "- For BlockWise 128x128 scaling, a and b should be float8, scales should be float, scale_a should be (", ceil_div(mat_a.size(0), 128), ", ", ceil_div(mat_a.size(1), 128), ") and scale_b should be (", ceil_div(mat_b.size(0), 128), ", ", ceil_div(mat_b.size(1), 128), "), and both should be near-inner-dim-major (with 16-byte aligned strides).\n" + "- For Blockwise 1x32 scaling, a and b should be float8, scales should be float8_e8m0fnu, scale_a should have ", round_up(mat_a.size(0), 128) * round_up(ceil_div(mat_a.size(1), 32), 4), " elements and scale_b should have ", round_up(mat_b.size(1), 128) * round_up(ceil_div(mat_b.size(0), 32), 4), " elements, and both should be contiguous.\n" + "- For Blockwise 1x16 scaling, a and b should be float4 (packed 2x), scales should be float8_e4m3fn, scale_a should have ", round_up(mat_a.size(0), 128) * round_up(ceil_div(mat_a.size(1) * 2, 16), 4), " elements and scale_b should have ", round_up(mat_b.size(1), 128) * round_up(ceil_div(mat_b.size(0) * 2, 16), 4), " elements, and both should be contiguous.\n" + "Got mat_a.dtype()=", mat_a.scalar_type(), ", scale_a[0].dtype()=", scale_a[0].scalar_type(), ", scale_a[0].size()=", scale_a[0].sizes(), ", scale_a[0].stride()=", scale_a[0].strides(), ", ", + "mat_b.dtype()=", mat_b.scalar_type(), ", scale_b[0].dtype()=", scale_b[0].scalar_type(), ", scale_b[0].size()=", scale_b[0].sizes(), " and scale_b[0].stride()=", scale_b[0].strides() + ); + + at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)}); + + auto bias_ = bias.value_or(Tensor()); + + // dispatch to appropriate lower-level calls for error checking & execution + if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) { + return _scaled_tensorwise_tensorwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) { + return _scaled_rowwise_rowwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::BLOCK_128x128_1x128) { + return _scaled_block128x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_128x128) { + return _scaled_block1x128_block128x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_1x128) { + return 
_scaled_block1x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) { + return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); + } else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) { + return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out, + scale_a[1], scale_b[1]); + } else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) { + return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); + } else if (gemm_impl == ScaledGemmImplementation::MXFP4_MXFP4) { + return _scaled_mxfp4_mxfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); + } else { + TORCH_CHECK_VALUE(false, "Invalid state - found an implementation, but not really"); + } +} + +Tensor +_scaled_mm_cuda_v2( + const Tensor& mat_a, const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + + return _scaled_mm_cuda_v2_out( + mat_a, mat_b, + scale_a, scale_recipe_a, swizzle_a, + scale_b, scale_recipe_b, swizzle_b, + bias, + out_dtype, + contraction_dim, + use_fast_accum, + out); +} + +// 2d-2d and 2d-3d +// scaling=MXFP8 +// CUDA-only +Tensor& +_mx8_mx8_bf16_grouped_mm_fbgemm( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const SwizzleType& swizzle_a, + const Tensor& scale_b, + const SwizzleType& swizzle_b, + const std::optional& offs, + Tensor& out) { + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + bool b_is_3d = mat_b.dim() == 3; + bool is_2d_2d = a_is_2d && b_is_2d; + bool is_2d_3d = a_is_2d && b_is_3d; + TORCH_CHECK_VALUE(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases"); + TORCH_CHECK_VALUE(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs requires offsets"); + TORCH_CHECK_VALUE(out.scalar_type() == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm"); + // MXFP8 expects float8_e8m0fnu scales. 
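For orientation, the 1x32 block-scale bookkeeping used here looks roughly like the following sketch (plain Python, shapes assumed, not code from this patch); the padded count matches the swizzled-layout element checks earlier in this patch, and the e8m0 requirement noted above is enforced by the check that follows.

def ceil_div(a, b): return -(-a // b)
def round_up(a, b): return ceil_div(a, b) * b

M, K = 64, 32
blocks_per_row = ceil_div(K, 32)                                   # one e8m0 scale per 32 K-elements
unswizzled_elems = M * blocks_per_row                              # plain row-major scale layout
swizzled_elems = round_up(M, 128) * round_up(blocks_per_row, 4)    # SWIZZLE_32_4_4 pads rows to 128, block-columns to 4
print(unswizzled_elems, swizzled_elems)                            # 64 vs 512 for this small shape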
+ TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu, + "For MXFP8 grouped gemm, both scales must be float8_e8m0fnu tensors."); +#ifdef USE_ROCM + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::NO_SWIZZLE && swizzle_b == SwizzleType::NO_SWIZZLE, + "For ROCM MXFP8 grouped gemm, both scale swizzle types must be SWIZZLE_NONE"); +#else + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4 && swizzle_b == SwizzleType::SWIZZLE_32_4_4, + "For CUDA MXFP8 grouped gemm, both scale swizzle types must be SWIZZLE_32_4_4"); +#endif + +#if defined(USE_FBGEMM_GENAI) and !defined(USE_ROCM) + fbgemm_gpu::mx8mx8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs.value(), + out); +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "mxfp8_mxfp8 grouped gemm requires compile with USE_FBGEMM_GENAI"); +#endif + return out; +} + +// 2d-2d and 2d-3d cases +// scaling=rowwise +// CUDA-only +Tensor& +_f8_f8_bf16_rowwise_grouped_mm_cuda( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& offs, + const std::optional& bias, + const bool use_fast_accum, + Tensor& out) { + TORCH_CHECK_VALUE(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); + TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type()); + + at::cuda::detail::f8f8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + bias, + use_fast_accum, + out); + return out; +} + +// 2d-2d and 2d-3d cases +// scaling=rowwise +// only being called for rocm +Tensor& +_f8_f8_bf16_rowwise_grouped_mm_rocm( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& offs, + Tensor& out) { + TORCH_CHECK_VALUE(mat_a.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_a.scalar_type()); + TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_b.scalar_type()); + +#if defined(USE_FBGEMM_GENAI) && defined(USE_ROCM) + fbgemm_gpu::f8f8bf16_rowwise_grouped_mm( + mat_a, + // FBGEMM expects B matrix shape to be (.., N, K) + mat_b.transpose(-2, -1), + scale_a, + scale_b, + offs, + out); +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM") +#endif + return out; + +} + +// Dispatch f8 x f8 -> bf16 row-wise scaled to rocm/cuda +Tensor& +_f8_f8_bf16_rowwise_grouped_mm( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& offs, + const std::optional& bias, + bool use_fast_accum, + Tensor& out) { + // FP8 per-tensor and per-row scaling expect fp32 scales. 
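One detail from the ROCm branch above worth a sketch: FBGEMM wants B laid out as (..., N, K), so the caller's (..., K, N) tensor is handed over with its last two dims swapped (plain Python, illustrative only, not code from this patch). The float32 scale requirement noted just above is then enforced by the check that follows.

def to_fbgemm_b_shape(shape_kn):
    # Swap the last two dims: (..., K, N) -> (..., N, K)
    return shape_kn[:-2] + (shape_kn[-1], shape_kn[-2])

assert to_fbgemm_b_shape((8, 128, 64)) == (8, 64, 128)   # grouped 3D case
assert to_fbgemm_b_shape((128, 64)) == (64, 128)         # 2D case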
+ TORCH_CHECK_VALUE(scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, + "For grouped FP8 rowwise, both scales must be float32 tensors"); +#ifndef USE_ROCM + return _f8_f8_bf16_rowwise_grouped_mm_cuda( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + bias, + use_fast_accum, + out); +#else + // NOTE: ignore use_fast_accum + TORCH_CHECK_VALUE(!bias.has_value(), "ROCM grouped gemm does not support bias") + return _f8_f8_bf16_rowwise_grouped_mm_rocm( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + out); +#endif +} + +Tensor +_scaled_grouped_mm_cuda( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& offs, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum) { + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); + TORCH_CHECK_VALUE(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); + + TORCH_CHECK_VALUE(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); + TORCH_CHECK_VALUE(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); + TORCH_CHECK_VALUE(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK_VALUE(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + + // NOTE(slayton): For sub-1B formats want contraction_dim argument? + if (!a_is_2d || !b_is_2d) { + TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + } + TORCH_CHECK_VALUE( + mat_a.size(-1) % 16 == 0, + "Expected trailing dimension of mat_a to be divisible by 16 ", + "but got mat1 shape: (", + mat_a.sizes(), + ")."); + TORCH_CHECK_VALUE(mat_b.size(-2) % 16 == 0 && mat_b.size(-1) % 16 == 0, + "Expected mat_b shape to be divisible by 16 ", + "but got mat_b shape: (", + mat_b.sizes(), + ")."); + + + TORCH_CHECK_VALUE(!bias.has_value(), "Bias not supported yet"); + TORCH_CHECK_VALUE(!scale_result.has_value(), "Scale result not supported yet"); + TORCH_CHECK_VALUE(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix"); + + // NOTE: mxfp8 x mxfp8 requires (and asserts later) that offsets is present. + // for rowwise, no offsets implies 3d-3d and is handled by lower-level + // routines + if (offs.has_value()) { + TORCH_CHECK_VALUE(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK_VALUE(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + // FP8 per-tensor and per-row scaling expect fp32 scales. + // MXFP8 expects float8_e8m0fnu scales. + TORCH_CHECK_VALUE( + (scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) || + (scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu), + "For FP8 tensorwise and rowwise, both scales must both be float32 tensors. For MXFP8, scales must both be float8_e8m0fnu tensors."); + + const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? 
offs->size(0) : 1; + check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); + check_scale(mat_b, scale_b, 1, 1, scale_multiplier); + + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK_VALUE(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + +#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM) + // MXFP8 grouped GEMM dispatching + bool is_mx8mx8bf16 = ( + mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn && + scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu + ); +#else + bool is_mx8mx8bf16 = false; +#endif + + if (is_mx8mx8bf16) { + // Note: Passing implied SwizzleType here, correctness of scale previously checked + // in `check_scale` call + return _mx8_mx8_bf16_grouped_mm_fbgemm( + mat_a, + mat_b, + scale_a, + SwizzleType::SWIZZLE_32_4_4, + scale_b, + SwizzleType::SWIZZLE_32_4_4, + offs.value(), + out); + } + + // If we're not MXFP8, then we're row-wise scaling. + return _f8_f8_bf16_rowwise_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + bias, + use_fast_accum, + out); +} + +namespace { + +std::array, 2> scale_grouped_kernel_dispatch = {{ + { "rowwise_rowwise", check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE}, + { "mxfp8_mxfp8", check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}}; + +} // anonymous namespace + +Tensor +_scaled_grouped_mm_cuda_v2( + const Tensor& mat_a, const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& offs, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum) { + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); + TORCH_CHECK_VALUE(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); + + TORCH_CHECK_VALUE(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); + TORCH_CHECK_VALUE(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); + TORCH_CHECK_VALUE(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK_VALUE(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + + // NOTE(slayton): For sub-1B formats want contraction_dim argument? 
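As in the v1 grouped entry point above, offs is a 1D int32 tensor with one entry per group, and in the 2d-2d row-wise case the expected scale size is multiplied by the number of groups (the scale_multiplier above). A rough sketch with assumed values (plain Python, not code from this patch):

offs_len = 3                              # offs.size(0): one int32 entry per group
M = 16                                    # rows of the 2D mat_a
scale_multiplier = offs_len               # 2d-2d case; otherwise 1
expected_rowwise_scale_a_elems = M * scale_multiplier
print(expected_rowwise_scale_a_elems)     # 48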
+ if (!a_is_2d || !b_is_2d) { + if (contraction_dim.size() > 0) { + const int dim_a = contraction_dim[0], dim_b = contraction_dim[1]; + TORCH_CHECK_VALUE(mat_a.size(dim_a) == mat_b.size(dim_b), + "Contraction dimensions (", dim_a, ",", dim_b, ") of mat_a and mat_b must match, got: ", mat_a.size(dim_a), " and ", + mat_b.size(dim_b)); + // Note: only (-1, -2) is currently supported + TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Currently contraction dims must be (-1, -2) only"); + } else { + TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + } + } + TORCH_CHECK_VALUE( + mat_a.size(-1) % 16 == 0, + "Expected trailing dimension of mat_a to be divisible by 16 ", + "but got mat1 shape: (", + mat_a.sizes(), + ")."); + TORCH_CHECK_VALUE(mat_b.size(-2) % 16 == 0 && mat_b.size(-1) % 16 == 0, + "Expected mat_b shape to be divisible by 16 ", + "but got mat_b shape: (", + mat_b.sizes(), + ")."); + + TORCH_CHECK_VALUE(!bias.has_value(), "Bias not supported yet"); + TORCH_CHECK_VALUE(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix"); + + // NOTE: mxfp8 x mxfp8 requires (and asserts later) that offsets is present. + // for rowwise, no offsets implies 3d-3d and is handled by lower-level + // routines + if (offs.has_value()) { + TORCH_CHECK_VALUE(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK_VALUE(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK_VALUE(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + + // Conversion of implicitly-defined enums to explicit + auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a); + auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a); + auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b); + auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b); + + // at this point we can start working out what we want to be doing + // Try to do as few steps as possible. + // NOTE: support is deliberately sparse, can explicitly enumerate all combinations allowed. + // Do this via a list of defined (name, acceptance, concrete_impl) tuples. + ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE; + for (const auto& fn_entry : scale_grouped_kernel_dispatch) { + const auto [name, accept_fn, scaled_gemm_impl] = fn_entry; + bool ok = accept_fn(mat_a.scalar_type(), + scale_recipe_a_enum, + scale_a, + mat_b.scalar_type(), + scale_recipe_b_enum, + scale_b); + if (ok) { + gemm_impl = scaled_gemm_impl; + break; + } + } + TORCH_CHECK_VALUE(gemm_impl != ScaledGemmImplementation::NONE, + "No gemm implementation was found"); + + switch (gemm_impl) { + case ScaledGemmImplementation::ROWWISE_ROWWISE: { + const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ?
offs->size(0) : 1; + _check_scales_fp8_rowwise(mat_a, scale_a[0], 0 /* dim */ , 0 /* arg_idx */, scale_multiplier); + _check_scales_fp8_rowwise(mat_b, scale_b[0], 1 /* dim */ , 1 /* arg_idx */, scale_multiplier); + return _f8_f8_bf16_rowwise_grouped_mm( + mat_a, + mat_b, + scale_a[0], + scale_b[0], + offs, + bias, + use_fast_accum, + out); + } + case ScaledGemmImplementation::MXFP8_MXFP8: { + _check_scales_mxfp8(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */); + _check_scales_mxfp8(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */); + return _mx8_mx8_bf16_grouped_mm_fbgemm( + mat_a, + mat_b, + scale_a[0], + swizzle_a_enum[0], + scale_b[0], + swizzle_b_enum[0], + offs.value(), + out); + } + default: + TORCH_CHECK_NOT_IMPLEMENTED(false, + "_scaled_grouped_mm_cuda_v2 is in an inconsistent state - should never reach here"); + } +} + +Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + bool a_b_and_out_are_bf16 = ( + mat_a.dtype() == at::kBFloat16 && + mat_b.dtype() == at::kBFloat16 && + out_dtype.value_or(at::kBFloat16) == at::kBFloat16 + ); +#ifndef USE_ROCM + bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; +#else + // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. + // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm + bool use_fast_path = false; +#endif + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + if (use_fast_path) { + // fast path, no d2h sync needed + at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); + } else { + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); + } + return out; +} + +static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { + // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm + TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); + TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); + + const auto batch1_sizes = batch1.sizes(); + const auto batch2_sizes = batch2.sizes(); + + int64_t bs = batch1_sizes[0]; + int64_t contraction_size = batch1_sizes[2]; + int64_t res_rows = batch1_sizes[1]; + int64_t res_cols = batch2_sizes[2]; + std::vector output_size {bs, res_rows, res_cols}; + + TORCH_CHECK(batch2_sizes[0] == bs && batch2_sizes[1] == contraction_size, + "Expected size for first two dimensions of batch2 tensor to be: [", + bs, ", ", contraction_size, "] but got: [", batch2_sizes[0], ", ", batch2_sizes[1], "]."); + + TORCH_CHECK(batch1.scalar_type() == batch2.scalar_type(), "batch1 and batch2 must have the same dtype"); + + TORCH_CHECK(out_dtype == batch1.scalar_type() || + (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); + if (!is_bmm && self_baddbmm.has_value()) { + const auto& self = self_baddbmm.value(); + TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor"); + TORCH_CHECK(self.sizes() == output_size, "self must have the 
same shape as the output"); + } +} + +Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { + IntArrayRef batch1_sizes = batch1.sizes(); + IntArrayRef batch2_sizes = batch2.sizes(); + + Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype)); + return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out); +} + +Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) { + baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true); + Scalar beta(0.0); + Scalar alpha(1.0); + { + NoNamesGuard guard; + baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); + } + + return out; +} + +Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { + // We need to copy the tensor + Tensor out = self.clone().to(self.options().dtype(out_dtype)); + + return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out); +} + +Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { + baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self); { NoNamesGuard guard; baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); @@ -1853,6 +3082,12 @@ Tensor _mm_dtype_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarTy } Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarType out_dtype, Tensor &out) { + TORCH_CHECK(self.dim() == 2, "self must be a matrix, got ", self.dim(), "-D tensor"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + TORCH_CHECK( + self.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + self.sizes()[0], "x", self.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "input dtypes must be the same"); TORCH_CHECK(out_dtype == self.scalar_type() || @@ -1861,7 +3096,7 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); - addmm_out_cuda_impl(const_cast(out), out, self, mat2, 0, 1); + addmm_out_cuda_impl(out, out, self, mat2, 0, 1); return out; } @@ -1872,6 +3107,14 @@ Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& m } Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { + TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type()); + TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + 
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); TORCH_CHECK(out_dtype == self.scalar_type() || (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)), diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 12ad84a15b18..ee28c5c1693f 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -999,12 +999,41 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { dtypes[i] = iter.dtype(i); } auto offset_calc = ::make_offset_calculator(iter); +#ifdef USE_ROCM + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1); + arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1); + arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1); + arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out0, result0); + c10::cast_and_store(dtypes[0], out1, result1); + c10::cast_and_store(dtypes[0], out2, result2); + c10::cast_and_store(dtypes[0], out3, result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out, result); + } + }); +#else launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { auto offsets = offset_calc.get(idx); void* out = data[0] + offsets[0]; arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); c10::cast_and_store(dtypes[0], out, result); }); +#endif } } diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 59b0426bab1f..62a07e1e28c8 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -42,6 +42,19 @@ void bfloat16_copy_kernel_cuda(TensorIteratorBase &iter) { }); } +#ifdef USE_ROCM +void bfloat16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { + gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::BFloat16 value) { + return static_cast(value); + }); +} +void float16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { + gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::Half value) { + return static_cast(value); + }); +} +#endif + void float8_copy_kernel_cuda(TensorIteratorBase &iter) { ScalarType dtype = iter.dtype(0); ScalarType other_dtype = iter.dtype(1); @@ -187,7 +200,17 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) { } else { float16_copy_kernel_cuda(iter); } - } else if (isBitsType(dtype)) { + } +#ifdef USE_ROCM + else if ((iter.dtype(1) == kBFloat16 || iter.dtype(1) == kHalf) && dtype == kFloat) { + if (iter.dtype(1) == kBFloat16) { + bfloat16tofloat32_copy_kernel_cuda(iter); + } else { + float16tofloat32_copy_kernel_cuda(iter); + } + } +#endif + else if (isBitsType(dtype)) { TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " "bits types to different bits 
types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 1ed6a7722d9b..344906a2a4df 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -38,12 +38,41 @@ __device__ inline int min(int a, int b) { #define BLOCK_STRIDE_BWD 2 // increasing block_stride to lower # of blocks launched #endif -static __device__ inline int p_start(int size, int pad, int kernel, int dilation, int stride) { - return (size + pad < ((kernel - 1) * dilation + 1)) ? 0 : (size + pad - ((kernel - 1) * dilation + 1)) / stride + 1; +template +static __device__ inline index_t p_start(index_t size, int pad, int kernel, int dilation, int stride) { + const auto kernel_extent = static_cast((kernel - 1) * dilation + 1); + return (size + pad < kernel_extent) ? index_t(0) : (size + pad - kernel_extent) / stride + 1; } -static __device__ inline int p_end(int size, int pad, int pooled_size, int stride) { - return min((size + pad) / stride + 1, pooled_size); +template +static __device__ inline index_t p_end(index_t size, int pad, index_t pooled_size, int stride) { + return std::min((size + pad) / stride + 1, pooled_size); +} + +static inline bool can_use_int32_nhwc( + int64_t nbatch, int64_t channels, + int64_t height, int64_t width, + int64_t pooled_height, int64_t pooled_width, + int64_t in_stride_n, int64_t in_stride_c, + int64_t in_stride_h, int64_t in_stride_w) +{ + constexpr int64_t int_max = std::numeric_limits::max(); + + int64_t max_intra_batch = + (height ? (height - 1) * in_stride_h : 0) + + (width ? (width - 1) * in_stride_w : 0) + + (channels? (channels - 1) * in_stride_c : 0); + + int64_t max_input_offset = (nbatch ? (nbatch - 1) * in_stride_n : 0) + max_intra_batch; + + if (max_input_offset > int_max) return false; + + int64_t out_batch_stride = pooled_height * pooled_width * channels; + if ((nbatch ? 
(nbatch - 1) * out_batch_stride : 0) > int_max) return false; + + if (height * width > int_max) return false; + + return true; } // kernels borrowed from Caffe @@ -85,21 +114,25 @@ __global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom } } -template +template C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) -__global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nbatch, - const int64_t channels, const int64_t height, - const int64_t width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, - const int in_stride_n, const int in_stride_c, - const int in_stride_h, const int in_stride_w, - const int kernel_stride_C, const int kernel_size_C, - scalar_t* top_data, int64_t* top_mask) { - extern __shared__ int smem[]; - int *out_mask_cached = smem; - scalar_t *out_cached = reinterpret_cast(&out_mask_cached[kernel_size_C*blockDim.x*blockDim.y*blockDim.z]); +__global__ void max_pool_forward_nhwc( + const scalar_t* bottom_data, + const int nbatch, + const index_t channels, const index_t height, const index_t width, + const index_t pooled_height, const index_t pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + const index_t in_stride_n, const index_t in_stride_c, + const index_t in_stride_h, const index_t in_stride_w, + const int kernel_stride_C, const int kernel_size_C, + scalar_t* top_data, int64_t* top_mask) { + + extern __shared__ unsigned char smem_raw[]; + index_t *out_mask_cached = reinterpret_cast(smem_raw); + scalar_t *out_cached = reinterpret_cast( + out_mask_cached + kernel_size_C*blockDim.x*blockDim.y*blockDim.z); // flattening cta for pre-computation & smem initialization; int thread_id = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); @@ -118,26 +151,26 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba int channel_id = blockIdx.x / nbatch; int channel_offset = threadIdx.x + channel_id * blockDim.x; - top_data = top_data + batch_id * pooled_height * pooled_width * channels; - top_mask = top_mask + batch_id * pooled_height * pooled_width * channels; - bottom_data = bottom_data + batch_id * in_stride_n; + top_data = top_data + static_cast(batch_id) * (pooled_height * pooled_width * channels); + top_mask = top_mask + static_cast(batch_id) * (pooled_height * pooled_width * channels); + bottom_data = bottom_data + static_cast(batch_id) * in_stride_n; - out_cached = &out_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x]; - out_mask_cached = &out_mask_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x]; + out_cached += (threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x; + out_mask_cached += (threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x; - int oH = (pooled_height + gridDim.z-1) / gridDim.z; - int oW = (pooled_width + gridDim.y-1) / gridDim.y; + int oH = (static_cast(pooled_height) + gridDim.z - 1) / gridDim.z; + int oW = (static_cast(pooled_width) + gridDim.y - 1) / gridDim.y; int ostartH = threadIdx.z + blockIdx.z*oH; - int oendH = ::min(ostartH+oH, pooled_height); + int oendH = ::min(ostartH+oH, static_cast(pooled_height)); int ostartW = threadIdx.y + blockIdx.y*oW; - int oendW = ::min(ostartW+oW, pooled_width); + int oendW = 
::min(ostartW+oW, static_cast(pooled_width)); for (int oh = ostartH; oh < oendH; oh+=blockDim.z) { - int hstart = oh * stride_h - pad_h; - int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + index_t hstart = static_cast(oh) * stride_h - pad_h; + index_t hend = std::min(hstart + static_cast((kernel_h - 1) * dilation_h + 1), height); for (int ow = ostartW; ow < oendW; ow+=blockDim.y) { - int wstart = ow * stride_w - pad_w; - int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); + index_t wstart = static_cast(ow) * stride_w - pad_w; + index_t wend = std::min(wstart + static_cast((kernel_w - 1) * dilation_w + 1), width); while(hstart < 0) hstart += dilation_h; while(wstart < 0) @@ -185,12 +218,12 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba // Else do it Non-Prefetch... else #endif - for (int ih = hstart; ih < hend; ih += dilation_h) { - for (int iw = wstart; iw < wend; iw += dilation_w) { + for (index_t ih = hstart; ih < hend; ih += dilation_h) { + for (index_t iw = wstart; iw < wend; iw += dilation_w) { int cached_index = threadIdx.x; const scalar_t *ptr_input = bottom_data + ih * in_stride_h + iw * in_stride_w; - for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { - scalar_t val = ptr_input[c*in_stride_c]; + for (index_t c = channel_offset; c < channels; c += static_cast(blockDim.x) * kernel_stride_C) { + scalar_t val = ptr_input[c * in_stride_c]; if ((val > out_cached[cached_index]) || at::_isnan(val)) { out_cached[cached_index] = val; out_mask_cached[cached_index] = ih * width + iw; @@ -200,15 +233,15 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba } } - scalar_t *ptr_output_data = top_data + (oh * pooled_width + ow) * channels; - int64_t *ptr_output_mask = top_mask + (oh * pooled_width + ow) * channels; + scalar_t *ptr_output_data = top_data + (static_cast(oh) * pooled_width + ow) * channels; + int64_t *ptr_output_mask = top_mask + (static_cast(oh) * pooled_width + ow) * channels; int cached_index = threadIdx.x; - for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { + for (index_t c = channel_offset; c < channels; c += static_cast(blockDim.x) * kernel_stride_C) { ptr_output_data[c] = out_cached[cached_index]; - ptr_output_mask[c] = out_mask_cached[cached_index]; + ptr_output_mask[c] = static_cast(out_mask_cached[cached_index]); out_cached[cached_index] = at::numeric_limits::lower_bound(); - out_mask_cached[cached_index] = 0; + out_mask_cached[cached_index] = index_t(0); cached_index += blockDim.x; } } @@ -216,7 +249,7 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba } -static const int BLOCK_THREADS = 256; +static constexpr int BLOCK_THREADS = 256; template #if defined (USE_ROCM) @@ -462,6 +495,11 @@ const Tensor& indices) { maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); const dim3 block(block_x, block_y, block_z); + bool use_int32 = can_use_int32_nhwc( + nbatch, nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + in_stride_n, in_stride_c, in_stride_h, in_stride_w); + int kernel_stride_C = ceil_div( safe_downcast(nInputPlane), block_x * 4); int kernel_size_C = ceil_div( @@ -476,18 +514,41 @@ const Tensor& indices) { ceil_div(safe_downcast(outputHeight), block_z*BLOCK_STRIDE_FWD)); const dim3 grid(grid_x, grid_y, grid_z); - size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); - AT_ASSERT(shmem_size <= 
-      AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock);
-
-      max_pool_forward_nhwc<scalar_t>
-      <<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>>(
-          input_data, nbatch,
-              nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
-              kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-              in_stride_n, in_stride_c,
-              in_stride_h, in_stride_w,
-              kernel_stride_C, kernel_size_C,
-              output_data, indices_data);
+      size_t shmem_size;
+      size_t mask_elems = static_cast<size_t>(kernel_size_C) * block_x * block_y * block_z;
+
+      if (use_int32) {
+        shmem_size = mask_elems * (sizeof(int32_t) + sizeof(scalar_t));
+        TORCH_CHECK(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock,
+                    "shared memory too small");
+        max_pool_forward_nhwc<scalar_t, int32_t>
+            <<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>>(
+            input_data, static_cast<int>(nbatch),
+            static_cast<int32_t>(nInputPlane),
+            static_cast<int32_t>(inputHeight),
+            static_cast<int32_t>(inputWidth),
+            static_cast<int32_t>(outputHeight),
+            static_cast<int32_t>(outputWidth),
+            kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+            static_cast<int32_t>(in_stride_n),
+            static_cast<int32_t>(in_stride_c),
+            static_cast<int32_t>(in_stride_h),
+            static_cast<int32_t>(in_stride_w),
+            kernel_stride_C, kernel_size_C,
+            output_data, indices_data);
+      } else {
+        shmem_size = mask_elems * (sizeof(int64_t) + sizeof(scalar_t));
+        TORCH_CHECK(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock,
+                    "shared memory too small");
+        max_pool_forward_nhwc<scalar_t, int64_t>
+            <<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>>(
+            input_data, static_cast<int>(nbatch),
+            nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
+            kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+            in_stride_n, in_stride_c, in_stride_h, in_stride_w,
+            kernel_stride_C, kernel_size_C,
+            output_data, indices_data);
+      }
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
     }
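
Note on the MaxPool2d.cu hunks above: the NHWC forward kernel is now templated on an
index_t, and the launcher selects the int32_t instantiation when can_use_int32_nhwc()
confirms that every offset the kernel computes fits in a signed 32-bit integer, falling
back to int64_t otherwise. The standalone C++ sketch below only illustrates the shape of
such a guard; the helper name fits_in_int32 and the particular offset terms are
assumptions for illustration, not the exact checks in can_use_int32_nhwc.

#include <cstdint>
#include <limits>

// Illustrative only: return true when the largest linear offsets an NHWC
// max-pool kernel would form stay within int32 range, so the narrower
// (and cheaper) index type can be used safely.
static bool fits_in_int32(int64_t nbatch, int64_t channels,
                          int64_t height, int64_t width,
                          int64_t out_height, int64_t out_width,
                          int64_t in_stride_n) {
  constexpr int64_t int_max = std::numeric_limits<int32_t>::max();
  // Largest input offset: start of the last batch plus one full image.
  if ((nbatch > 1 ? (nbatch - 1) * in_stride_n : 0) + channels * height * width > int_max)
    return false;
  // Largest output offset: one past the last element of the output tensor.
  if (nbatch * out_height * out_width * channels > int_max)
    return false;
  // The mask caches a flattened h * w position per element.
  if (height * width > int_max)
    return false;
  return true;
}

Shared memory is sized per branch because each cached mask entry is sizeof(int32_t)
versus sizeof(int64_t), which is why shmem_size is recomputed inside both arms of the
if (use_int32) dispatch before the TORCH_CHECK against sharedMemPerBlock.
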
diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu
index 602dfd6e5288..65b0e1441de7 100644
--- a/aten/src/ATen/native/cuda/Embedding.cu
+++ b/aten/src/ATen/native/cuda/Embedding.cu
@@ -15,9 +15,7 @@
 #include
 #include
 
-#if CUB_SUPPORTS_SCAN_BY_KEY()
 #include
-#endif
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include
@@ -36,9 +34,9 @@ namespace at::native {
 namespace {
 
 #if defined(USE_ROCM)
-static const int BLOCKDIMY = 16;
+static constexpr int BLOCKDIMY = 16;
 #else
-static const int BLOCKDIMY = 32;
+static constexpr int BLOCKDIMY = 32;
 #endif
 
 template
@@ -240,10 +238,6 @@ __global__ void renorm_kernel(
 
 } // anonymous namespace
 
-#if !CUB_SUPPORTS_SCAN_BY_KEY()
-template <typename index_t>
-void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
-#endif
 
 Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_,
                                      int64_t num_weights, int64_t padding_idx,
@@ -306,7 +300,6 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
 
   if (scale_grad_by_freq) {
     count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-#if CUB_SUPPORTS_SCAN_BY_KEY()
     AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
       cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -333,11 +326,6 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
         num_indices
       );
     });
-#else
-    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
-      embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
-    });
-#endif
   }
 
   return embedding_backward_cuda_kernel(grad, orig_indices,
diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu
index 76307a0bf549..6ce419137345 100644
--- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu
+++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu
@@ -10,9 +10,7 @@
 #include
 
-#if CUB_SUPPORTS_UNIQUE_BY_KEY()
 #include
-#endif
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include
@@ -88,9 +86,9 @@ __global__ void compute_grad_weight_bags(
     const int64_t stride_warped) {
   int64_t num_of_segments = *num_of_segments_ptr;
-  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int id = gid / stride_warped;
-  const int startFeature = gid % stride_warped;
+  const int64_t gid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const int64_t id = gid / stride_warped;
+  const int64_t startFeature = gid % stride_warped;
   if (startFeature >= stride) {
     return;
   }
@@ -134,9 +132,9 @@ __global__ void compute_grad_weight(
   int64_t num_of_segments = *num_of_segments_ptr;
   using accscalar_t = acc_type<scalar_t, true>;
-  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int id = gid / stride_warped;
-  const int startFeature = gid % stride_warped;
+  const int64_t gid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const int64_t id = gid / stride_warped;
+  const int64_t startFeature = gid % stride_warped;
   if (startFeature >= stride) {
     return;
   }
@@ -167,9 +165,9 @@ __global__ void sum_and_scatter(
   int64_t num_of_segments = *num_of_segments_ptr;
   int64_t num_of_partial_segments = *num_of_partial_segments_ptr;
-  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int id = gid / stride_warped;
-  const int startFeature = gid % stride_warped;
+  const int64_t gid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const int64_t id = gid / stride_warped;
+  const int64_t startFeature = gid % stride_warped;
   if (startFeature >= stride) {
     return;
   }
@@ -196,18 +194,9 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm
         partials_per_segment_offset[num_of_segments-1];
 }
 
-#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
-__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) {
-  *num_of_segments_ptr = num_of_segments;
-}
-#endif
 } // anon namespace
 
-#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
-template <typename index_t>
-int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets);
-#endif
 
 Tensor embedding_backward_cuda_kernel(
     const Tensor &grad,
@@ -234,20 +223,12 @@ Tensor embedding_backward_cuda_kernel(
   auto segment_offsets = at::empty({numel}, orig_indices.options());
   auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong));
   int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr<int64_t>();
-#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
-  AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
-    int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key<index_t>(sorted_indices, segment_offsets);
-    write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments);
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
-  });
-#else
   AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
     cuda::cub::unique_by_key(
       sorted_indices.const_data_ptr<index_t>(), thrust::make_counting_iterator(0),
       segment_offsets.mutable_data_ptr<index_t>(),
       num_of_segments_ptr, sorted_indices.numel());
   });
-#endif
 
   int64_t max_segments = std::min(numel, num_weights);
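
Note on the Embedding.cu and EmbeddingBackwardKernel.cu hunks above (the EmbeddingBag.cu
hunk below follows the same pattern): with the CUB_SUPPORTS_SCAN_BY_KEY() /
CUB_SUPPORTS_UNIQUE_BY_KEY() guards removed, the scale_grad_by_freq count pass and the
segment-offset computation always go through the cuda::cub primitives, and the legacy
thrust-based fallbacks (embedding_dense_backward_cuda_scan,
embedding_backward_cuda_kernel_unique_by_key) are deleted. As a reference for what the
count pass produces, here is a CPU-only sketch of the same semantics; the function name
occurrence_counts is made up for illustration and this is not the kernel code.

#include <cstddef>
#include <cstdint>
#include <vector>

// For every position in a sorted index list, compute how many times that index
// value occurs in total. The CUDA path reaches the same result with scan-by-key
// passes over the sorted indices instead of an explicit loop.
std::vector<int64_t> occurrence_counts(const std::vector<int64_t>& sorted_indices) {
  std::vector<int64_t> count(sorted_indices.size(), 0);
  std::size_t start = 0;
  while (start < sorted_indices.size()) {
    std::size_t end = start;
    while (end < sorted_indices.size() && sorted_indices[end] == sorted_indices[start]) {
      ++end;
    }
    for (std::size_t i = start; i < end; ++i) {
      count[i] = static_cast<int64_t>(end - start);  // run length of this value
    }
    start = end;
  }
  return count;
}
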
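
Note on the ForeachBinaryOp*.cu and ForeachPointwiseOp.cu hunks further below: each one
replaces "return tensor_lists[k];" with "return std::move(tensor_lists[k]);". Copy
elision (NRVO) applies only to a complete local object, not to an element of one, so
returning an element of the local tensor_lists vector by name copies that inner
std::vector<Tensor>; std::move turns the return into a move. The same reasoning applies
to the emplace_back(std::move(vec_res)) change in ForeachBinaryOpScalarList.cu. A
minimal standalone illustration, with the element type simplified to int:

#include <utility>
#include <vector>

// Returning lists[1] by name would copy the inner vector, because NRVO cannot
// apply to an element of a local container. std::move turns it into a move.
std::vector<int> pick_second(std::vector<std::vector<int>> lists) {
  return std::move(lists[1]);
}
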
diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu
index fb92c7488a15..ab3747df031e 100644
--- a/aten/src/ATen/native/cuda/EmbeddingBag.cu
+++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu
@@ -31,16 +31,10 @@
 #include
 
-#if CUB_SUPPORTS_SCAN_BY_KEY()
 #include
-#endif
 
 namespace at::native {
 
-#if !CUB_SUPPORTS_SCAN_BY_KEY()
-template <typename index_t>
-void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
-#endif
 
 namespace {
 
@@ -199,7 +193,6 @@ Tensor embedding_bag_backward_cuda_sum_avg(
 
   if (scale_grad_by_freq) {
     count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-#if CUB_SUPPORTS_SCAN_BY_KEY()
     AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
       cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -226,11 +219,6 @@ Tensor embedding_bag_backward_cuda_sum_avg(
         num_indices
      );
    });
-#else
-    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
-      embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
-    });
-#endif
   }
   return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices, count, num_weights,
                                         padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag,
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
index 7ee02b02b41f..227d42247ebd 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
@@ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op(
       Op(),
       alpha.to<opmath_t>());
 
-  return tensor_lists[2];
+  return std::move(tensor_lists[2]);
 }
 
 template class Op>
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
index 80d748dd3579..9ac0e875b2d6 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
@@ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op(
           /* res_arg_index */ 1>(),
       Op(),
       scalar.to<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }
 
 template class Op>
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
index dcb93188b5e6..b28aa690630b 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
@@ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op(
   }
 
   tensor_lists.emplace_back(tensors.vec());
-  tensor_lists.emplace_back(vec_res);
+  tensor_lists.emplace_back(std::move(vec_res));
 
   using opmath_t = at::opmath_type;
   multi_tensor_apply<2, opmath_t>(
@@ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op(
           /* res_arg_index */ 1>(),
       Op());
 
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }
 
 template class Op>
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
index ad5eeee5ebec..bc6bd3789125 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
@@ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op(
       Op(),
       scalar.data_ptr(),
       alpha.to<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }
 
 template class Op>
diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
index 7a3276c44750..7f563f55d556 100644
--- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
@@ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op(
         scalar.to<opmath_t>());
   });
 
-  return tensor_lists[3];
+  return std::move(tensor_lists[3]);
 }
 
 template